Overview
This file presents a workflow of the analysis made so far for the project. Although it was created primarily for visualization, the main processing functions and code blocks are also included in the file and can be inspected or tested by unfolding the buttons on the right.
Loadings
These are auxiliary functions for loading libraries and saving plots.
settings.r contains most of the static parameters,
dictionaries and conditions.
# Bootstrap: resolve the working directory and source the helper scripts
# that every later chunk relies on.
wd <- getwd() # Set the working directory manually if launching in R GUI
source("load_pckgs.r") # Load the library loader function
source("settings.r") # Load the settings
source("plot_saver.r") # Load the function saving the plots to png
Settings:
# file_names & paths
info_and_carb_data_file_name <- "Data.xlsx" # the name of your excel file with info data
petro_data_file_name <- "petrography.xlsx" # the name of the petrography data file
gcms_data_file_name <- "Preliminary analysis Shang and Zhou.xlsx" # the name of the GCMS data file
inpath <- file.path(wd, "input") # folder holding the raw excel inputs
outpath <- file.path(wd, "output") # folder where plots are written
# NOTE(review): hard-coded Windows drive path; breaks on other machines -- confirm where this is used
path <- "p:/2025-vessels/spectra/mz"
# plot settings
parts <- c("lip", "neck", "shoulder", "body", "foot", "crotch") # sequence of vessel parts, top to bottom
gap <- 250 # horizontal gap between the mirrored pot halves in the carbonisation plots
cex <- 10 # font size setting
drywetColors <- c("WET" = "#1f77b4", "DRY" = "#ff7f0e") # colors for dry and wet in pie charts
na_col <- "#b8241a" # color for a missing vessel part
file_format <- "pdf" # preferred file format for plots
# processing settings
print_res <- TRUE # print intermediate results? -- TODO confirm where this flag is read
excluded_vars <- c("c.conf", "g.conf") # variables to exclude -- TODO confirm where this is applied
# dictionaries
# The rule set for dry/wet assignment.
# Each row pairs a condition (query_string, pasted into a dplyr::case_when()
# call by dryOrWet()) with a signed score (value): negative = DRY,
# positive = WET, magnitude = confidence level, 0 = unclassified.
# case_when() evaluates rules top-down, so the final "TRUE" row is the
# catch-all. header/help are display strings for the conditions table.
# NOTE(review): the help text for row 3 reads "CLEAR CLEAR CARB", but its
# query accepts any non-water, non-NA neck/shoulder with a carb body --
# confirm this simplification is intended.
dry_wet_conditions <- data.frame(
query_string = c(
"col_in_neck == 'carb' & col_in_shoulder == 'carb' & col_in_body == 'carb'",
"col_in_neck == 'clear' & col_in_shoulder == 'carb' & col_in_body == 'carb'",
"col_in_neck != 'water' & !is.na(col_in_neck) & col_in_shoulder != 'water' & !is.na(col_in_shoulder) & col_in_body == 'carb'",
"col_in_neck == 'water' | col_in_shoulder == 'water' | col_in_body == 'water'",
"col_in_neck == 'carb' & !is.na(col_in_shoulder) & col_in_body == 'clear'",
"col_in_neck == 'clear' & col_in_shoulder == 'carb' & col_in_body == 'clear'",
"col_in_neck == 'clear' & col_in_shoulder == 'clear' & col_in_body == 'clear'",
"col_in_foot == 'water'",
"TRUE"
),
value = c(-3, -2, -1, 3, 2, 2, 1, 1, 0),
header = c(
"DRY, level 3", "DRY, level 2", "DRY, level 1", "WET, level 3", "WET, level 2", "WET, level 2", "WET, level 1", "WET, level 1", "na"
),
help = c(
"from neck to body: CARB CARB CARB", "from neck to body: CLEAR CARB CARB", "from neck to body: CLEAR CLEAR CARB", "from neck to body: ANY is WATER", "from neck to body: CARB ANY CLEAR", "from neck to body: CLEAR CARB CLEAR", "from neck to body: CLEAR CLEAR CLEAR", "foot is water and body is absent", "the rest combinations"
)
)
# Variable descriptions
# One entry per column of the merged analysis table. Prefixes encode origin:
# i.* specimen info, m.* measurements, c.* carbonization analysis,
# g.* GCMS data, p.* petrography. Each entry carries:
#   type    - storage/semantic class ("num dbl", "txt fct", "txt ord", "lgl", ...)
#   units   - measurement units, or NULL when dimensionless
#   options - allowed values, or NULL when unconstrained
#   descr   - human-readable description
# NOTE(review): several descr strings end in "??" -- flagged as unconfirmed
# by the original author; verify these before publication.
var_info <- list(
"id" = list(type = "text id", units = NULL, options = NULL, descr = "Unique sample id"),
"i.site" = list(type = "txt fct", units = NULL, options = c("Gaoqingcaopo", "Kanjiazhai", "Yangxinliwu"), descr = "The name of the site"),
"i.period" = list(type = "txt ord", units = NULL, options = c("Shang", "Zhou"), descr = "The dynastie"),
"i.lip" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if lip is present in a sample"),
"i.neck" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if neck is present in a sample"),
"i.shoulder" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if shoulder is present in a sample"),
"i.body" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if body is present in a sample"),
"i.foot" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if foot is present in a sample"),
"i.crotch" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if crotch is present in a sample"),
"i.completeness" = list(type = "num dbl", units = "%%", options = NULL, descr = "100 means all parts are present in a sample, less means a fragment is missing some parts"),
"i.collection" = list(type = "txt fct", units = NULL, options = NULL, descr = "A combination of period and a first letter of the site"),
"m.body_width" = list(type = "num dbl", units = "mm", options = NULL, descr = "Width of the body"),
"m.height" = list(type = "num dbl", units = "mm", options = NULL, descr = "Height of the vessel fragment"),
"m.rim" = list(type = "num dbl", units = "mm", options = NULL, descr = "The size of the rim"),
"m.orifice" = list(type = "num dbl", units = "mm", options = NULL, descr = "The size of the orifice"),
"m.cord_mark_width" = list(type = "num dbl", units = "mm", options = NULL, descr = "The width of a cordmark"),
# NOTE(review): units "mm" looks wrong for a categorical type variable -- confirm
"m.cord_mark_type" = list(type = "txt fct", units = "mm", options = NULL, descr = "Cordmark type"),
"c.col_in_lip" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the lip part from inside"),
"c.col_in_neck" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the neck part from inside"),
"c.col_in_shoulder" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the shoulder part from inside"),
"c.col_in_body" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the body part from inside"),
"c.col_out_lip" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the lip part from outside"),
"c.col_out_neck" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the neck part from outside"),
"c.col_out_shoulder" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the shoulder part from outside"),
"c.col_out_body" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the body part from outside"),
"c.col_in_foot" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the foot part from inside"),
"c.col_out_foot" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the foot part from outside"),
"c.col_out_crotch" = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the crotch part from outside"),
"c.group" = list(type = "txt ord", units = NULL, options = c("dry", "wet"), descr = "A classification according to carbonisation analysis"),
"c.conf" = list(type = "num dbl", units = "%", options = c(0.3, 0.5, 1), descr = "A confidence level for 'Dry or Wet' classification, 1 means dead certainty"),
"g.lipids_conc" = list(type = "num dbl", units = "%", options = NULL, descr = "A concentration of lipids in a sample, promille?"),
"g.scfa" = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of short-chain fatty acids (C2:C6)"),
"g.mcfa" = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of medium-chain fatty acids (C6:C12)"),
"g.lcfa" = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of long-chain fatty acids (>C12)"),
"g.uns_fa" = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of unsaturated fatty acids"),
"g.diacids" = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of dicarboxylic acids: oxidized fatty acids with two carboxyl groups"),
"g.alkanes" = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of saturated hydrocarbons"),
"g.p_s" = list(type = "num dbl", units = NULL, options = NULL, descr = "Pristane/Phytane ratio ??"),
"g.o_s" = list(type = "num dbl", units = NULL, options = NULL, descr = "Odd-over-even carbon number predominance ??"),
"g.aapac18e_h" = list(type = "num dbl", units = NULL, options = NULL, descr = "An index related to C18 alkylphenanthrene homologs (E/H variants) ??"),
"g.srr" = list(type = "num dbl", units = NULL, options = NULL, descr = "A sterane ratio expressed as a percentage ??"),
"g.group" = list(type = "txt ord", units = NULL, options = c("plant", "animal", "mixture"), descr = "A classification for organics source based on expert analysis"),
"g.conf" = list(type = "num dbl", units = NULL, options = NULL, descr = "A confidence level for organic source classification (g.group), 1 means dead certainty"),
"g.plant_count" = list(type = "num int", units = NULL, options = NULL, descr = "A rowwise count for 'plant'"),
"g.tree_count" = list(type = "num int", units = NULL, options = NULL, descr = "A rowwise count for 'tree'"),
"g.cereal" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'cereal' was mention in the comments"),
"g.fruit" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'fruit' was mention in the comments"),
"g.vegetable" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'vegetable' was mention in the comments"),
"g.resin" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'resin' was mention in the comments"),
"g.fish" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'fish' was mention in the comments"),
"g.complex_mxtr" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'complex mixture' was mention in the comments"),
"p.optical_activity" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if optical activity is 'Active'"),
"p.mica" = list(type = "txt ord", units = NULL, options = NULL, descr = "The relative abundance of mica minerals"),
"p.group" = list(type = "txt fct", units = NULL, options = NULL, descr = "A classification for petrography based on expert analysis"),
"p.lower_fr_bound" = list(type = "num dbl", units = "mm", options = NULL, descr = "A lower bound for rock fragments approximate size range"),
# NOTE(review): upper bound has units = NULL while the lower bound is in mm -- confirm
"p.upper_fr_bound" = list(type = "num dbl", units = NULL, options = NULL, descr = "An upper bound for rock fragments approximate size range"),
"p.lower_roundness_bound" = list(type = "txt ord", units = NULL, options = c("angular", "subangular", "subround", "round"), descr = "A lower bound for rock roundndess"),
"p.upper_roundness_bound" = list(type = "txt ord", units = NULL, options = c("angular", "subangular", "subround", "round"), descr = "An upper bound for rock roundndess"),
"p.granite" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'granite' was mentioned in the comments"),
"p.granodiorite" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'granodiorite' was mentioned in the comments"),
"p.diorite" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'diorite' was mentioned in the comments"),
"p.sandstone" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'sandstone' was mentioned in the comments"),
"p.mudstone" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'mudstone' was mentioned in the comments"),
"p.limestone" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'limestone' was mentioned in the comments"),
"p.gritstone" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'gritstone' was mentioned in the comments"),
"p.volcanic" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'volcanic' was mentioned in the comments"),
"p.andesite" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'andesite' was mentioned in the comments"),
"p.microgranite" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'microgranite' was mentioned in the comments"),
"p.ksp" = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'ksp' was mentioned in the comments")
)
# GCMS var description
# Describes the per-scan columns of the GCMS spectra table: units (or NULL
# when dimensionless) plus a human-readable description for each column.
# NOTE(review): the column names look like a standard mass-spectrometry
# spectra header (seqNum, msLevel, totIonCurrent, ...) -- confirm the source
# of the spectra table before relying on these descriptions.
gcms_spectra_var_info <- list(
"seqNum" = list(units = NULL, descr = "Unique sequence number (ID)"),
"acquisitionNum" = list(units = NULL, descr = "Unique sequence number (ID)"),
"msLevel" = list(units = NULL, descr = "The level of mass spectrometry data: level 1 meaning only one stage of mass spectrometry was performed (no MS/MS fragmentation)"),
"polarity" = list(units = NULL, descr = "The value -1 indicates the scans were recorded in negative ion mode"),
"peaksCount" = list(units = NULL, descr = "How many peaks (detected ions) were recorded in each scan"),
"totIonCurrent" = list(units = NULL, descr = "The total ion current, representing the overall signal strength"),
"retentionTime" = list(units = "s", descr = "The time taken for a solute to pass through a chromatography column"),
"basePeakMZ" = list(units = NULL, descr = "The mass-to-charge (m/z) ratio of the most intense peak"),
"basePeakIntensity" = list(units = NULL, descr = "The intensity of the most intense peak"),
"centroided" = list(units = NULL, descr = "TRUE indicates that the data points have already been processed to represent peak centers")
)
Library loading function:
load_pkg <- function(pkg) {
  # Install (if needed) and attach a single package.
  #
  # pkg: a single package name as a character string.
  # Returns the package name invisibly so calls compose quietly in lapply().
  # Stops with a clear message on invalid input, failed installation,
  # or failed attachment.
  if (!is.character(pkg) || length(pkg) != 1) {
    stop("Parameter 'pkg' must be a single character string.")
  }
  # Install only when the namespace is missing.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    tryCatch({
      message("Installing ", pkg)
      install.packages(pkg, dependencies = TRUE, repos = "https://cloud.r-project.org", quiet = TRUE)
    }, error = function(e) {
      stop(sprintf("Installation failed for package '%s': %s", pkg, e$message))
    })
    # FIX: install.packages() signals a *warning*, not an error, when a
    # package cannot be installed, so the tryCatch above never fired on
    # failure. Re-check the namespace to fail fast with a clear message.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop(sprintf("Package '%s' is still unavailable after attempted installation.", pkg))
    }
  }
  # Attach only if the package is not already on the search path.
  if (!paste0("package:", pkg) %in% search()) {
    tryCatch({
      suppressPackageStartupMessages(library(pkg, character.only = TRUE))
    }, error = function(e) {
      stop(sprintf("Loading failed for package '%s': %s", pkg, e$message))
    })
  }
  invisible(pkg)
}
Plot saving function:
plot_saver <- function(object, path, name, width, height, dpi) {
  # Save a ggplot to disk and return HTML (as knitr asis output) that embeds
  # the saved image as a clickable, full-width lightbox link.
  #
  # object: the ggplot object to save.
  # path, name: destination folder and file name (joined with file.path()).
  # width, height: plot size passed straight to ggsave() (its default units).
  # dpi: output resolution passed to ggsave().
  source("load_pckgs.r")
  # FIX: the dependency vector was defined but never loaded, so ggsave()
  # failed whenever ggplot2/knitr were not already attached by the caller.
  dependencies <- c("ggplot2", "knitr")
  invisible(lapply(dependencies, load_pkg))
  full_path <- file.path(path, name)
  # Save the plot using ggsave
  ggplot2::ggsave(
    filename = full_path,
    plot = object,
    width = width,
    height = height,
    dpi = dpi
  )
  # Generate HTML code for a clickable full-width image.
  html_code <- paste0(
    '<a href="', full_path,
    '" data-lightbox="gallery">',
    '<img src="', full_path,
    '" class="full-width" alt="Plot image" />',
    '</a>'
  )
  # asis_output() makes the HTML render verbatim in the knitted document.
  knitr::asis_output(html_code)
}
Analysis
Carbonization Pattern Plotter
The function loads the raw data from
info_and_carb_data_file_name (a parameter set in
settings.r) and presents mean carbonisation patterns for
each vessel part in one of several modes:
"by_site_and_period", "by_site", or
"by_period".
Carbonization Pattern Plotter Function:
carb_plotter <- function(mode = "by_site_and_period", gap = 250) {
  # Build the carbonisation-pattern figure: one sub-plot per group (site x
  # period, site only, or period only), arranged on a 2x2 grid with a caption.
  #
  # mode: one of "by_site_and_period", "by_site", "by_period".
  # gap:  horizontal gap (drawing units) between the IN and OUT pot halves.
  #
  # Returns list(data = NULL, plot = <arranged grob>); pass $plot to plot_saver().
  modes <- c("by_site_and_period", "by_site", "by_period")
  # Fail fast on an unknown mode, before sourcing files or loading packages.
  if (!mode %in% modes) {
    stop(sprintf("%s is not an option. Available cases:\n%s\n", mode, paste(modes, collapse = ", ")))
  }
  # loadings (settings.r also defines globals used below: inpath, parts, cex)
  source("plot_carb_pattern.r")
  source("settings.r")
  source("load_pckgs.r")
  source("plot_saver.r")
  dependencies <- c("readxl", "dplyr", "tidyr", "ggplot2", "grid", "gridExtra", "purrr", "sysfonts", "showtext", "knitr")
  invisible(lapply(dependencies, load_pkg))
  sysfonts::font_add_google("Open Sans", "opensans")
  showtext::showtext_auto()
  # NOTE(review): windowsFonts() exists only on Windows; guard with
  # .Platform$OS.type if this ever needs to run elsewhere.
  grDevices::windowsFonts(opensans = windowsFont("Open Sans"))
  data <- readxl::read_excel(file.path(inpath, info_and_carb_data_file_name), na = "")
  # Reshape col_in_*/col_out_* columns to long format -- one row per
  # (record, part, side) -- keeping only the definite "carb"/"clear" states.
  carb <- data %>%
    dplyr::select(
      record_id,
      site_name, period,
      dplyr::starts_with("col_")
    ) %>%
    tidyr::pivot_longer(
      cols = dplyr::starts_with("col_"),
      names_to = "part",
      values_to = "state"
    ) %>%
    dplyr::mutate(
      side = ifelse(grepl("col_in_", part), "IN", "OUT"),
      part = gsub("col_(in|out)_", "", part)
    ) %>%
    dplyr::filter(
      state %in% c("carb", "clear")
    )
  # Digitised pot profile (right half); y is negated so the lip sits at 0.
  x <- c(0, 373, 375, 368, 344,
         322, 316, 342, 368, 384,
         397, 399, 395, 384, 366,
         346, 289, 238, 179, 128,
         62, 0)
  y <- -c(0, 0, 17, 35, 53,
          77, 99, 152, 234, 357,
          514, 630, 785, 856, 873,
          867, 818, 765, 715, 679,
          657, 648)
  perimeter_IN <- data.frame(x = x, y = y)
  perimeter_OUT <- data.frame(x = -x - gap * 2, y = y)
  # Coloration zones: one closed polygon per vessel part along the profile.
  lip <- rbind(perimeter_IN[1:4, ], c(0, perimeter_IN[4, 2]))
  neck <- rbind(perimeter_IN[4:7, ], c(0, perimeter_IN[7, 2]), c(0, perimeter_IN[4, 2]))
  shoulder <- rbind(perimeter_IN[7:9, ], c(0, perimeter_IN[9, 2]), c(0, perimeter_IN[7, 2]))
  body <- rbind(perimeter_IN[9:12, ], c(0, perimeter_IN[12, 2]), c(0, perimeter_IN[9, 2]))
  foot <- rbind(perimeter_IN[12:19, ], c(perimeter_IN[19, 1], perimeter_IN[12, 2]))
  crotch <- rbind(perimeter_IN[19:22, ], c(perimeter_IN[22, 1], perimeter_IN[12, 2]), c(perimeter_IN[19, 1], perimeter_IN[12, 2]))
  polygons_OUT <- list(
    lip = lip,
    neck = neck,
    shoulder = shoulder,
    body = body,
    foot = foot,
    crotch = crotch
  )
  # Mirror the right-half polygons across x to draw the left (inside) half.
  mirror <- function(df) {
    df$x <- df$x * (-1) - gap * 2
    return(df)
  }
  polygons_IN <- lapply(polygons_OUT, mirror)
  polygons <- list(inside = polygons_IN, outside = polygons_OUT)
  # Text labels: six part names plus the INSIDE/OUTSIDE captions.
  labels <- data.frame(
    x = c(
      rep(-gap, 6),
      mean(c(-x[15], -x[22]) - gap * 2),
      mean(c(x[15], x[22]))
    ),
    y = c(
      mean(c(y[2], y[4])),
      mean(c(y[4], y[7])),
      mean(c(y[7], y[9])),
      mean(c(y[9], y[12])),
      mean(c(y[15], y[19])),
      y[22],
      rep(y[1] + 0.1 * abs(abs(y[1]) - abs(y[15])), 2)
    ),
    label = c(parts, "INSIDE", "OUTSIDE")
  )
  # Build one sub-plot per group; errors are reported and yield NULL slots.
  if (mode == "by_site_and_period") {
    site_by_period <- carb %>% dplyr::select(period, site_name) %>% dplyr::distinct()
    plots <- site_by_period %>%
      purrr::pmap(function(period, site_name) {
        tryCatch({
          plot_carb_pattern(
            df = carb,
            poly = polygons,
            site = site_name,
            period = period,
            x = x,
            y = y,
            perimeter_IN = perimeter_IN,
            perimeter_OUT = perimeter_OUT,
            labels = labels,
            cex = cex)
        }, error = function(e) {
          message(sprintf("Error for site '%s' and period '%s': %s", site_name, period, e$message))
          return(NULL)
        })
      })
  } else if (mode == "by_site") {
    sites <- carb %>% dplyr::select(site_name) %>% dplyr::distinct()
    plots <- sites %>%
      purrr::pmap(function(site_name) {
        tryCatch({
          plot_carb_pattern(
            df = carb,
            poly = polygons,
            site = site_name,
            # FIX: was `period = period` -- `period` is undefined in this pmap
            # scope, so every by-site plot errored inside tryCatch and came
            # back NULL. Aggregating over all periods needs period = NULL.
            period = NULL,
            x = x,
            y = y,
            perimeter_IN = perimeter_IN,
            perimeter_OUT = perimeter_OUT,
            labels = labels,
            cex = cex)
        }, error = function(e) {
          message(sprintf("Error for site '%s': %s", site_name, e$message))
          return(NULL)
        })
      })
  } else {
    periods <- carb %>% dplyr::select(period) %>% dplyr::distinct()
    plots <- periods %>%
      purrr::pmap(function(period) {
        tryCatch({
          plot_carb_pattern(
            df = carb,
            poly = polygons,
            # FIX: was `site = site_name` -- `site_name` is undefined in this
            # pmap scope; aggregating over all sites needs site = NULL.
            site = NULL,
            period = period,
            x = x,
            y = y,
            perimeter_IN = perimeter_IN,
            perimeter_OUT = perimeter_OUT,
            labels = labels,
            cex = cex)
        }, error = function(e) {
          message(sprintf("Error for period '%s': %s", period, e$message))
          return(NULL)
        })
      })
  }
  # Arrange the sub-plots on a 2x2 grid with a two-part caption underneath.
  g <- gridExtra::arrangeGrob(grobs = plots, ncol = 2, nrow = 2)
  label_grob <- grid::textGrob(
    "% of carbonized (total clear and carb). Darker color means larger percentage of carbonized sherds;",
    x = 0.65, y = 0.02,
    just = c("right", "bottom"),
    gp = grid::gpar(fontsize = 12)
  )
  na_grob <- grid::textGrob(
    " cross-lining means NA ",
    x = 0.65, y = 0.02,
    just = c("left", "bottom"),
    gp = grid::gpar(fontsize = 12, col = "red")
  )
  final_grob <- gridExtra::arrangeGrob(
    g, label_grob, na_grob,
    ncol = 1,
    heights = grid::unit(c(0.9, 0.05, 0.05), "npc")
  )
  # The data slot is reserved (kept NULL) to mirror other *_plotter returns.
  results <- list(data = NULL, plot = final_grob)
  return(results)
}
Carbonization Pattern Plotter Auxiliary Function:
plot_carb_pattern <- function(df, poly, site = NULL, period = NULL,
                              x, y, perimeter_IN, perimeter_OUT, labels,
                              cex = 10, gap = 250, na_col = "#b8241a") {
  # Draw one carbonisation-pattern sub-plot: two mirrored pot profiles
  # (inside/outside halves), each vessel part shaded by the percentage of
  # carbonized sherds; parts without data are cross-hatched.
  #
  # df:           long-format data (site_name, period, side, part, state).
  # poly:         list(inside = ..., outside = ...) of part polygons.
  # site, period: filters; at least one must be non-NULL.
  # x, y:         pot profile coordinates; perimeter_IN/OUT: outline frames.
  # labels:       label positions (rows 7:8 are INSIDE/OUTSIDE captions).
  # cex:          font scale; gap: half-gap between profiles;
  # na_col:       hex used by col_by_sides() to flag parts with no data.
  # Relies on the global `parts` vector from settings.r for factor levels.
  # loadings
  period_colors <- list("Shang" = "blue", "Zhou" = "aquamarine") # color settings of periods
  source("col_by_sides.r")
  source("load_pckgs.r")
  # FIX: hrbrthemes and ggtext are called below (theme_ipsum,
  # element_markdown) but were missing from this dependency list, so the
  # function failed in environments where they were not already installed.
  dependencies <- c("dplyr", "tidyr", "ggplot2", "ggpattern", "hrbrthemes", "ggtext")
  invisible(lapply(dependencies, load_pkg))
  # FIX: the original had no branch for site and period both NULL, which
  # surfaced later as an obscure "object 'counts' not found" error.
  if (is.null(site) && is.null(period)) {
    stop("plot_carb_pattern(): provide at least one of 'site' or 'period'.")
  }
  if (!is.null(site) && !is.null(period)) {
    if (nrow(df %>% dplyr::filter(site_name == !!site & period == !!period)) == 0) {
      # No records for this site/period combination: return an empty plot so
      # the caller's grid keeps its slot. (FIX: removed an unreachable stop()
      # that followed this return in the original.)
      return(ggplot2::ggplot())
    }
    counts <- df %>%
      dplyr::filter(site_name == !!site & period == !!period) %>%
      dplyr::group_by(site_name, period, side, part, state) %>%
      dplyr::summarise(count = n(), .groups = "drop") %>%
      tidyr::pivot_wider(names_from = state, values_from = count, values_fill = list(count = 0)) %>%
      dplyr::mutate(
        total = carb + clear,
        prop_carb = round(100*carb / total, 2),
        prop_clear = round(100*clear / total, 2),
        # Grey level: 0% carbonized -> white (255), 100% -> black (0).
        color = round(255 * (1 - prop_carb / 100), 0),
        hex = sprintf("#%02x%02x%02x", color, color, color),
        part = factor(part, levels = parts),
        period = factor(period, levels = c("Shang", "Zhou"))
      ) %>%
      dplyr::arrange(period, part)
    title_text <- sprintf(
      "%s, <span style='color:%s;'>%s</span> period",
      site,
      period_colors[[period]],
      period
    )
  } else if (!is.null(site) && is.null(period)) {
    # Site only: aggregate over all periods.
    counts <- df %>%
      dplyr::select(-period) %>%
      dplyr::filter(site_name == !!site) %>%
      dplyr::group_by(site_name, side, part, state) %>%
      dplyr::summarise(count = n(), .groups = "drop") %>%
      tidyr::pivot_wider(
        names_from = state,
        values_from = count,
        values_fill = list(count = 0)
      ) %>%
      dplyr::mutate(
        total = carb + clear,
        prop_carb = round(100*carb / total, 2),
        prop_clear = round(100*clear / total, 2),
        color = round(255 * (1 - prop_carb / 100), 0),
        hex = sprintf("#%02x%02x%02x", color, color, color),
        part = factor(part, levels = parts)
      ) %>%
      dplyr::arrange(site_name, part)
    title_text <- sprintf("%s, any period", site)
  } else if (is.null(site) && !is.null(period)) {
    # Period only: aggregate over all sites.
    counts <- df %>%
      dplyr::select(-site_name) %>%
      dplyr::filter(period == !!period) %>%
      dplyr::group_by(period, side, part, state) %>%
      dplyr::summarise(count = n(), .groups = "drop") %>%
      tidyr::pivot_wider(names_from = state, values_from = count, values_fill = list(count = 0)) %>%
      dplyr::mutate(
        total = carb + clear,
        prop_carb = round(100*carb / total, 2),
        prop_clear = round(100*clear / total, 2),
        color = round(255 * (1 - prop_carb / 100), 0),
        hex = sprintf("#%02x%02x%02x", color, color, color),
        part = factor(part, levels = parts),
        period = factor(period, levels = c("Shang", "Zhou"))
      ) %>%
      dplyr::arrange(period, part)
    title_text <- sprintf(
      "All sites, <span style='color:%s;'>%s</span> period",
      period_colors[[period]],
      period
    )
  }
  # Keep only what the drawing needs and build the "NN% (total)" labels.
  counts <- counts %>%
    dplyr::select(side, part, hex, carb, clear) %>%
    dplyr::mutate(text = paste0(round(100*carb/(carb+clear)),"% (", carb + clear, ")")) %>%
    dplyr::arrange(side, part)
  color_in <- col_by_sides(counts, "IN")
  color_out <- col_by_sides(counts, "OUT")
  colors <- list(inside = color_in, outside = color_out)
  # Percent-label anchor positions: six labels per side (lip .. crotch).
  numbers <- data.frame(
    number = c(colors$inside$text, colors$outside$text),
    x = c(
      -rep((1.9*gap), 4),
      -x[17]-1.9*gap,
      -1.9*gap,
      -rep((0.1*gap), 4),
      x[17]-0.2*gap,
      -0.1*gap
    ),
    y = rep(c(
      mean(c(y[2], y[4])),
      mean(c(y[4], y[7])),
      mean(c(y[7], y[9])),
      mean(c(y[9], y[12])),
      y[17],
      y[22]
    ), 2
    )
  )
  # Add one layer per vessel-part polygon: a grey fill for parts with data,
  # a cross-hatch pattern for parts whose color equals na_col (no data).
  # (Extracted helper: the inside/outside loops were duplicated verbatim.)
  add_part_layers <- function(p, polys, cols) {
    for (i in seq_along(polys)) {
      poly_data <- polys[[i]]
      poly_color <- cols$hex[i]
      if (poly_color == na_col) {
        p <- p + ggpattern::geom_polygon_pattern(
          data = poly_data, ggplot2::aes(x = x, y = y, pattern = "diagonal"),
          pattern_fill = "transparent",
          pattern_density = 1,
          pattern_angle = 45,
          fill = "transparent",
          color = "black",
          show.legend = FALSE
        )
      } else {
        p <- p + ggplot2::geom_polygon(
          data = poly_data, ggplot2::aes(x = x, y = y),
          fill = poly_color,
          color = "black"
        )
      }
    }
    p
  }
  p <- ggplot2::ggplot()
  p <- add_part_layers(p, poly$inside, colors$inside)
  p <- add_part_layers(p, poly$outside, colors$outside)
  plot <- p +
    ggplot2::coord_fixed(ratio = 1) +
    ggplot2::geom_path(
      data = perimeter_OUT,
      ggplot2::aes(x = x, y = y),
      linewidth = 1
    ) +
    ggplot2::geom_path(
      data = perimeter_IN,
      ggplot2::aes(x = x, y = y),
      linewidth = 1
    ) +
    ggplot2::labs(x = NULL, y = NULL, title = title_text) +
    # Dashed vertical symmetry axis of the right (outside) half.
    ggplot2::geom_segment(
      ggplot2::aes(
        x = 0,
        y = y[1]+0.1*abs(abs(y[1])-abs(y[22])),
        xend = 0,
        yend = y[22]-0.1*abs(abs(y[1])-abs(y[22]))
      ),
      linetype = "dashed",
      linewidth = 1.5,
      lineend = "round"
    ) +
    # Dashed vertical symmetry axis of the left (inside) half.
    ggplot2::geom_segment(
      ggplot2::aes(
        x = -gap*2,
        y = y[1]+0.1*abs(abs(y[1])-abs(y[22])),
        xend = -gap*2,
        yend = y[22]-0.1*abs(abs(y[1])-abs(y[22]))
      ),
      linetype = "dashed",
      linewidth = 1.5,
      lineend = "round"
    ) +
    # INSIDE / OUTSIDE captions (rows 7:8 of labels).
    ggplot2::geom_text(
      data = labels[7:8,],
      ggplot2::aes(x = x, y = y, label = label),
      size = 0.5*cex
    ) +
    # Inside percentages (left half), left-aligned.
    ggplot2::geom_text(
      data = numbers[1:6,],
      ggplot2::aes(x = x, y = y, label = number),
      size = 0.4*cex,
      hjust = 0,
      nudge_x = -0.2
    ) +
    # Outside percentages (right half), right-aligned.
    ggplot2::geom_text(
      data = numbers[7:12,],
      ggplot2::aes(x = x, y = y, label = number),
      size = 0.4*cex,
      hjust = 1,
      nudge_x = 0.2
    ) +
    ggplot2::xlim(-900, 400) +
    hrbrthemes::theme_ipsum(base_family = "opensans") +
    ggplot2::theme(
      plot.title = ggtext::element_markdown(size = 2*cex, face = "bold", hjust = 0.5),
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),
      axis.text.x = ggplot2::element_blank(),
      axis.text.y = ggplot2::element_blank()
    )
  return(plot)
}
# Build the carbonisation-pattern figure and save it to the output folder.
source("carbPlotter.r")
carb <- carb_plotter(mode = "by_site_and_period", gap = 250)
plot_saver(carb$plot, outpath, "carb_pattern.png", 13.3, 10.7, 300)
Dry or Wet Summaries
The function loads the raw data from
info_and_carb_data_file_name (parameter set in
settings.r) and performs calculations for
dry/wet classification based on the conditions defined in
dry_wet_conditions (set in settings.r).
Dry or Wet Function:
dryOrWet <- function(mode) {
  # Classify each sherd as DRY/WET from its carbonisation coloration (rules
  # in dry_wet_conditions, settings.r) and summarise per group with pie
  # charts, summary/condition tables, and a Fisher's exact test.
  #
  # mode: "by_site_and_period", "by_site", or "by_period".
  # Returns list(data = <per-record classification>, plot = <patchwork figure>).
  # loadings & settings
  source("buildCondition.r")
  source("load_pckgs.r")
  source("settings.r")
  source("plot_saver.r")
  dependencies <- c("readxl", "dplyr", "tidyr", "ggplot2", "rlang", "hrbrthemes", "ggtext", "ggpubr", "colorspace", "patchwork", "knitr")
  invisible(lapply(dependencies, load_pkg))
  # NOTE(review): windowsFonts() is Windows-only; guard if portability matters.
  grDevices::windowsFonts(opensans = windowsFont("Open Sans"))
  df <- readxl::read_excel(file.path(inpath, info_and_carb_data_file_name), na = "")
  group_vars <- switch(
    mode,
    "by_site_and_period" = c("site_name", "period"),
    "by_site" = "site_name",
    "by_period" = "period"
  )
  # Build the case_when() expression from the rule table; row 8 (the
  # foot-is-water rule) is dropped here ("without feet").
  # NOTE(review): eval(parse_expr(...)) on a pasted string is fragile --
  # consider constructing the case_when() call with rlang quasiquotation.
  # case_when_string <- paste( "case_when(", buildCondition(dry_wet_conditions), ")" ) # with feet
  case_when_string <- paste( "case_when(", buildCondition(dry_wet_conditions[-8,]), ")" ) # without feet
  dry_or_wet <- df %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(group_vars))) %>%
    dplyr::mutate(
      state = eval(rlang::parse_expr(case_when_string))
    )
  # Weighted counts per group: level 3 counts fully, level 2 x0.75, level 1 x0.5.
  # NOTE(review): WET uses 0.75*2 where DRY uses plain 0.75 -- confirm the
  # extra factor of 2 is intentional and not a typo.
  summary <- dry_or_wet %>%
    dplyr::summarize(
      .groups = "drop",
      DRY = round(sum(state == -3) + 0.75*sum(state == -2) + 0.5*sum(state == -1), 0),
      WET = round(sum(state == 3) + 0.75*2*sum(state == 2) + 0.5*sum(state == 1), 0),
      na = sum(state == 0),
      total = sum(!is.na(state)),
      ident_rate = paste(round(100*(sum(!is.na(state)) - sum(state == 0))/sum(!is.na(state)), 0), "%"),
      dry3 = sum(state == -3),
      dry2 = sum(state == -2),
      dry1 = sum(state == -1),
      wet3 = sum(state == 3),
      wet2 = sum(state == 2),
      wet1 = sum(state == 1)
    )
  # Long format for the pie charts: one row per (group, DRY/WET) with shares.
  plot_data <- summary %>%
    tidyr::pivot_longer(
      cols = c(DRY, WET),
      names_to = "condition",
      values_to = "value"
    ) %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(group_vars)), condition) %>%
    dplyr::summarise(total = sum(value), .groups = 'drop') %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(group_vars))) %>%
    dplyr::mutate(percentage = total / sum(total) * 100)
  # Fix facet order chronologically whenever period is among the group vars.
  if (mode != "by_site") {
    plot_data <- plot_data %>%
      dplyr::mutate(period = factor(period, levels = c("Shang", "Zhou")))
  }
  facet_formula <- paste("~", paste(group_vars, collapse = " + "))
  # Stacked bar in polar coordinates = one pie chart per facet.
  piecharts <-
    ggplot2::ggplot(
      plot_data,
      ggplot2::aes(y = percentage, fill = condition)
    ) +
    ggplot2::geom_bar(
      ggplot2::aes(x = factor(1)),
      stat = "identity",
      width = 1,
      colour = "white",
      linewidth = 1
    ) +
    ggplot2::geom_text(
      ggplot2::aes(x = factor(1), label = paste0(round(percentage), "%")),
      # Namespaced for consistency with the rest of this file.
      position = ggplot2::position_stack(vjust = 0.5),
      size = 4,
      color = "white"
    ) +
    ggplot2::coord_polar("y", start = 0) +
    ggplot2::facet_wrap(as.formula(facet_formula), ncol = 2) +
    ggplot2::labs(
      x = NULL,
      y = NULL,
      fill = "Condition",
      title = sprintf("DRY versus WET by %s", paste(group_vars, collapse = " and "))
    ) +
    ggplot2::scale_fill_manual(values = drywetColors) +
    hrbrthemes::theme_ipsum(base_family = "opensans") +
    ggplot2::theme(
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),
      axis.text.x = ggplot2::element_blank(),
      axis.text.y = ggplot2::element_blank(),
      axis.title.x = ggplot2::element_blank(),
      axis.title.y = ggplot2::element_blank(),
      strip.text = ggplot2::element_text(size = 15, face = "bold", hjust = 0.5),
      legend.position = "bottom",
      panel.spacing = ggplot2::unit(2, "lines"),
      plot.title = ggtext::element_markdown(size = 20, face = "bold", hjust = 0.5)
    )
  # Contingency table for Fisher's exact test; row names like "Site, Period".
  stat_data <- summary %>%
    dplyr::mutate(
      group = do.call(paste, c(dplyr::select(., dplyr::all_of(group_vars)), sep = ", "))
    ) %>%
    dplyr::select(group, DRY, WET)
  data_matrix <- as.matrix(stat_data[, c("DRY", "WET")])
  rownames(data_matrix) <- stat_data$group
  ftest <- fisher.test(data_matrix, alternative = "two.sided")
  # NOTE(review): pvalue is computed but never displayed in the figure --
  # consider adding it to the title or a caption.
  pvalue <- format(ftest$p.value, scientific = FALSE, digits = 3)
  conditions <- dry_wet_conditions %>%
    dplyr::select(header, help, value)
  # Locate the DRY/WET columns so table shading survives column reordering.
  dryCOLindex <- which(names(summary) == "DRY")
  wetCOLindex <- which(names(summary) == "WET")
  table_sum <-
    ggpubr::ggtexttable(
      summary,
      rows = NULL,
      theme = ggpubr::ttheme(
        "light",
        padding = ggplot2::unit(c(1.5, 6), "mm")
      )
    ) %>%
    ggpubr::table_cell_bg(
      row = 2:(nrow(summary) + 1),
      column = wetCOLindex,
      fill = colorspace::lighten(unname(drywetColors["WET"]), amount = 0.3),
      color = "white"
    ) %>%
    ggpubr::table_cell_bg(
      row = 2:(nrow(summary) + 1),
      column = dryCOLindex,
      fill = colorspace::lighten(unname(drywetColors["DRY"]), amount = 0.2),
      color = "white"
    )
  table_cond <-
    ggpubr::ggtexttable(
      conditions,
      rows = NULL,
      theme = ggpubr::ttheme(
        "light",
        padding = ggplot2::unit(c(4, 6), "mm")
      )
    ) %>%
    ggpubr::table_cell_bg(
      row = 2:4,
      column = 1,
      # FIX: lighten() was called without its namespace here while
      # colorspace::lighten is used above -- namespaced for consistency.
      fill = colorspace::lighten(unname(drywetColors["DRY"]), amount = 0.2),
      color = "white"
    ) %>%
    ggpubr::table_cell_bg(
      row = 5:9,
      column = 1,
      fill = colorspace::lighten(unname(drywetColors["WET"]), amount = 0.3),
      color = "white"
    )
  # patchwork layout: condition table over summary table, pie charts on the right.
  left_col <- (table_cond + ggplot2::theme(plot.margin = ggplot2::unit(c(0, 0, 100, 0), "pt"))) / table_sum
  combined_plot <- left_col | piecharts
  results <- list(data = dry_or_wet, plot = combined_plot)
  return(results)
}
# Run the dry/wet classification and save the combined figure.
source("dryOrWet.r")
drywet <- dryOrWet("by_site_and_period")
plot_saver(drywet$plot, outpath, "dry_wet.png", 6, 5, 300)
Variables Overviewer
Here we bind all data we have from drywet,
petrography, and residue analyses, and plot
each variable.
For each variable name we added prefixes addressing their origin:
- i.* Information about the specimen
- m.* Measurements of the specimen
- c.* Carbonization coloration
analysis
- g.* GCMS data
- p.* Petrography analysis data
Variables Overviewer Function:
#' Bind all project data and plot an overview of every variable.
#'
#' Joins the carbonisation classification (dryOrWet), petrography, and GCMS
#' tables into a single data frame keyed by sample id, optionally prints a
#' per-variable report (descriptions come from `var_info` in settings.r),
#' builds a boxplot per numeric variable and a grouped bar plot per
#' non-numeric variable, and writes the plots merged by prefix
#' (i./m./c./g./p.) as png files into `savepath`/overview.
#'
#' @param inpath   Folder holding the input excel files (file names are taken
#'   from settings.r: info_and_carb_data_file_name, petro_data_file_name,
#'   gcms_data_file_name).
#' @param savepath Folder where merged overview images are written (an
#'   "overview" subfolder is created if missing).
#' @param save     Currently unused; kept for interface compatibility.
#' @param talky    If TRUE, print record counts and variable descriptions.
#' @return A list with elements: data (joined table), numeric_data (long
#'   format), non_numeric_data (summary tables), numeric_plots,
#'   non_numeric_plots (named lists of ggplots), merged_plots (grobs).
overviewer <- function(inpath, savepath, save = FALSE, talky = TRUE) {
  ### LOADINGS
  ## dependencies
  source("settings.r")
  source("load_pckgs.r")
  source("parse_bounds.r")
  source("sum_non_numeric.r")
  source("string_processor.r")
  source("dryOrWet.r")
  dependencies <- c("readxl", "writexl", "dplyr", "tidyr", "stringr", "scales", "ggplot2", "Hmisc", "ggcorrplot", "grid", "gridExtra", "viridis")
  invisible(lapply(dependencies, load_pkg))
  ## data
  # NOTE(review): raw_df is read but never used below — kept so a missing
  # Data.xlsx still fails early here; confirm whether it can be dropped.
  raw_df <- readxl::read_excel(file.path(inpath, info_and_carb_data_file_name), na = "")
  raw_petro <- readxl::read_excel(file.path(inpath, petro_data_file_name), na = "")
  raw_arch <- dryOrWet("by_site_and_period")$data %>% dplyr::ungroup()
  raw_gcms <- readxl::read_excel(file.path(inpath, gcms_data_file_name), sheet = "Data", col_names = TRUE, na = "")
  #if (talky) {
  #  if (all(sapply(mget(c("raw_arch", "raw_gcms", "raw_petro")), function(x) nrow(x) > 0))) {
  #    message("✅ All tables were loaded and contain records.")
  #  } else {
  #    warning("❌ One or more tables are empty!")
  #  }
  #}
  ## result list preparing
  results <- list(
    data = NULL,
    numeric_data = NULL,
    non_numeric_data = NULL,
    numeric_plots = NULL,
    non_numeric_plots = NULL,
    merged_plots = NULL
  )
  ### PROCESSING DATA
  # raw_arch: rename, recode confidence/group, mark which vessel parts are
  # present, and prefix variables by origin (i./m./c.)
  arch <- raw_arch %>%
    dplyr::ungroup() %>%
    dplyr::rename(
      site = site_name,
      id = aux_id,
      conf = state
    ) %>%
    dplyr::mutate(
      # sign of 'state' encodes the class, magnitude the confidence
      group = dplyr::case_when(
        conf < 0 ~ "dry",
        conf > 0 ~ "wet",
        conf == 0 ~ NA
      ),
      group = factor(group, levels = c("dry", "wet"), ordered = TRUE),
      conf = dplyr::case_when(
        abs(conf) == 3 ~ 1,
        abs(conf) == 2 ~ 0.5,
        abs(conf) == 1 ~ 0.3
      ),
      site = factor(site, levels = c("Gaoqingcaopo", "Kanjiazhai", "Yangxinliwu"), ordered = FALSE),
      period = factor(period, levels = c("Shang", "Zhou"), ordered = TRUE),
      # a part counts as present if either side has a recorded coloration
      lip = !is.na(col_in_lip) | !is.na(col_out_lip),
      neck = !is.na(col_in_neck) | !is.na(col_out_neck),
      shoulder = !is.na(col_in_shoulder) | !is.na(col_out_shoulder),
      body = !is.na(col_in_body) | !is.na(col_out_body),
      foot = !is.na(col_in_foot) | !is.na(col_out_foot),
      crotch = !is.na(col_out_crotch),
      completeness = round(100 * (lip + neck + shoulder + body + foot + crotch) / 6),
      dplyr::across(dplyr::starts_with("col_"), ~ factor(tolower(as.character(.)))),
      collection = paste0(period, ".", substr(as.character(site), 1, 1))
    ) %>%
    dplyr::mutate(
      collection = factor(collection, levels = c("Shang.Y", "Shang.G", "Zhou.G", "Zhou.K"))
    ) %>%
    dplyr::select(
      -c(record_id, sample_number, part, mouth_present, foot_present, vis_alysis, notes, decoration, residue_sampled, petro_sampled, question_field)
    ) %>%
    dplyr::rename_with(~ paste0("i.", .), c(collection, site, period, lip, neck, shoulder, body, foot, crotch, completeness, cord_mark_type)) %>%
    dplyr::rename_with(~ paste0("m.", .), c(body_width, height, rim, orifice, cord_mark_width)) %>%
    dplyr::rename_with(~ paste0("c.", .), c(dplyr::starts_with("col_"), group, conf))
  # raw_petro: drop duplicated metadata columns, normalise names and values
  petro <- raw_petro %>%
    dplyr::select(-Photo, -`Interesting Incs`, -Site, -Period, -`Dry/Wet`, -`Dry/Wet Simple`, -Group) %>%
    dplyr::rename(
      id = `Sample #`,
      optical_activity = `Optical Activity`,
      rock_frags = `Rock Frags`,
      frag_size = `Frag Size`,
      frag_roundness = `Frag Roundness`,
      mica = Mica,
      group = `Group Simplified`
    ) %>%
    dplyr::mutate(
      # TRUE when the value starts with "a"/"A" (i.e. "Active").
      # FIX: the pattern was an upper-case "A" tested against a lower-cased
      # string, so it could never match and the column was always FALSE.
      optical_activity = stringr::str_starts(stringr::str_to_lower(optical_activity), "a"),
      # keep only digits and range punctuation for parse_bounds()
      frag_size = stringr::str_replace_all(frag_size, "[^0-9.<>=+-]", ""),
      frag_roundness = stringr::str_to_lower(frag_roundness)
    )
  # parse "0.5-1"-style size strings into numeric lower/upper bounds
  frag_size_clean <- parse_bounds(petro$frag_size) %>%
    dplyr::rename(
      lower_fr_bound = lower,
      upper_fr_bound = upper,
      frag_size = original
    )
  # lookup translating the recorded roundness ranges into ordered bounds
  # (keys reproduce the raw spellings, including "and-" variants)
  frag_shape_clean <- dplyr::tibble(
    original = c("and-subang", "ang-subang", "ang-subrnd", "and-subrnd", "subang-subrnd", "subang-rnd", "ang-rnd", "rnd"),
    lower_roundness_bound = c("angular", "angular", "angular", "angular", "subangular", "subangular", "angular", "round"),
    upper_roundness_bound = c("subangular", "subangular", "subround", "subround", "subround", "round", "round", "round")
  )
  # NOTE(review): rock_dictionary is built but not used further down —
  # presumably kept for interactive inspection; confirm before removing.
  rock_dictionary <- petro %>%
    dplyr::filter(!is.na(rock_frags)) %>%
    dplyr::mutate(rock_frags = stringr::str_replace_all(rock_frags, " and ", ";")) %>%
    tidyr::separate_rows(rock_frags, sep = ";|/|-") %>%
    dplyr::mutate(rock_frags = stringr::str_trim(rock_frags)) %>%
    dplyr::filter(rock_frags != "") %>%
    dplyr::distinct(rock_frags)
  petro <- dplyr::bind_cols(petro, frag_size_clean) %>%
    dplyr::left_join(frag_shape_clean, by = c("frag_roundness" = "original")) %>%
    dplyr::mutate(
      lower_roundness_bound = factor(
        lower_roundness_bound,
        levels = c("angular", "subangular", "subround", "round"),
        ordered = TRUE
      ),
      upper_roundness_bound = factor(
        upper_roundness_bound,
        levels = c("angular", "subangular", "subround", "round"),
        ordered = TRUE
      ),
      mica = factor(
        mica,
        levels = c("None", "No", "Rare", "Few", "Moderate", "Yes"),
        ordered = TRUE
      ),
      group = factor(group),
      # one logical indicator per rock type mentioned in rock_frags
      granite = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("granit", ignore_case = TRUE)), TRUE, FALSE),
      granodiorite = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("granodiorite", ignore_case = TRUE)), TRUE, FALSE),
      diorite = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("diorite", ignore_case = TRUE)), TRUE, FALSE),
      sandstone = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("sandstone", ignore_case = TRUE)), TRUE, FALSE),
      mudstone = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("mudstone", ignore_case = TRUE)), TRUE, FALSE),
      limestone = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("limestone", ignore_case = TRUE)), TRUE, FALSE),
      gritstone = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("gritstone", ignore_case = TRUE)), TRUE, FALSE),
      volcanic = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("volcanic", ignore_case = TRUE)), TRUE, FALSE),
      andesite = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("andesite", ignore_case = TRUE)), TRUE, FALSE),
      microgranite = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("microgranite", ignore_case = TRUE)), TRUE, FALSE),
      ksp = ifelse(!is.na(rock_frags) & stringr::str_detect(rock_frags, regex("ksp", ignore_case = TRUE)), TRUE, FALSE)
    ) %>%
    dplyr::select(-frag_roundness, -rock_frags, -dplyr::starts_with("frag_size")) %>%
    dplyr::rename_with(~ paste0("p.", .), -id)
  # raw_gcms: recode expert columns, count plant/tree markers row-wise,
  # extract comment keywords, prefix with g.
  chumpsy_vars <- c("aapa", "phy", "tmtd", "cholesterol_bi_products", "miliacin", "ergostanol", "dha", "lc_ketones", "pah", "bpca", "contaminants")
  gcms <- raw_gcms %>%
    dplyr::rename_with(tolower) %>%
    dplyr::rowwise() %>%  # needed for c_across() below
    dplyr::rename(
      lipids_conc = ug_g,
      group = preliminary_interpretation,
      conf = level_of_confidence,
      srr = `srr_%`
    ) %>%
    dplyr::mutate(
      id = sub("A$", "", id),  # strip trailing "A" so ids match other tables
      plant_count = sum(tolower(dplyr::c_across(dplyr::all_of(chumpsy_vars))) == 'p', na.rm = TRUE),
      tree_count = sum(tolower(dplyr::c_across(dplyr::all_of(chumpsy_vars))) == 'tr', na.rm = TRUE),
      cereal = ifelse(!is.na(comment) & stringr::str_detect(comment, regex("cereal", ignore_case = TRUE)), TRUE, FALSE),
      fruit = ifelse(!is.na(comment) & stringr::str_detect(comment, regex("fruit", ignore_case = TRUE)), TRUE, FALSE),
      vegetable = ifelse(!is.na(comment) & stringr::str_detect(comment, regex("vegetab", ignore_case = TRUE)), TRUE, FALSE),
      resin = ifelse(!is.na(comment) & stringr::str_detect(comment, regex("resin", ignore_case = TRUE)), TRUE, FALSE),
      fish = ifelse(!is.na(comment) & stringr::str_detect(comment, regex("fish", ignore_case = TRUE)), TRUE, FALSE),
      complex_mxtr = ifelse(!is.na(comment) & stringr::str_detect(comment, regex("complex", ignore_case = TRUE)), TRUE, FALSE),
      conf = dplyr::case_when(
        conf == 1 ~ 1,
        conf == 2 ~ 0.5,
        conf == 3 ~ 0.3
      ),
      group = dplyr::case_when(
        group == 1 ~ "plant",
        group == 2 ~ "mixture",
        group == 3 ~ "animal",
      ),
      group = factor(
        group,
        levels = c("plant", "mixture", "animal"),
        ordered = TRUE
      )
    ) %>%
    dplyr::ungroup() %>%
    dplyr::select(-dynastie, -dplyr::all_of(chumpsy_vars), -comment) %>%
    dplyr::rename_with(~ paste0("g.", .), -id)
  # All the tables are ready for joining now
  info.variable_prefix <- c(
    "ℹ️  Prefixes were added to each variable name addressing their function or derivation",
    "\t i.* = Information about the specimen",
    "\t m.* = Measurements of the specimen",
    "\t c.* = Carbonization coloration analysis",
    "\t g.* = GCMS data",
    "\t p.* = Petrography analysis data"
  )
  #if (talky) {
  #  for (line in info.variable_prefix) {
  #    cat(line, "\n")
  #  }
  #}
  # Fusing
  data <- arch %>%
    dplyr::inner_join(gcms, by = "id") %>%
    dplyr::inner_join(petro, by = "id")
  if (nrow(data) == 0) {
    stop("❌ data has no records!")
  } else {
    results$data <- data
  }
  if (talky) {
    message("⚠️ inner joining was used to link the data, use left_join to conserve unmatched records")
    intersected_ids <- length(Reduce(intersect, list(arch$id, gcms$id, petro$id)))
    cat("Petrography:", nrow(petro), "records\n")
    cat("GCMS:", nrow(gcms), "records\n")
    cat("Carbonisation:", nrow(arch), "records\n\n")
    cat("Max number of intersected id-s:", intersected_ids, "\n")
    cat("Number of records in the final table:", nrow(data), "\n")
    #dplyr::glimpse(data)
    cat("\n\n📦 Variable Descriptions:\n\n")
    # var_info is the variable dictionary declared in settings.r
    for (var in names(var_info)) {
      info <- var_info[[var]]
      cat(sprintf("📌 %s:\n", var))
      cat(sprintf("   🔧 Type: %s", info$type))
      if (!is.null(info$units)) cat(sprintf(" | 📏 Units: %s", info$units))
      cat("\n")
      cat(sprintf("   📝 Description: %s\n", info$descr))
      if (var %in% names(data)) {
        values <- data[[var]]
        options <- if (is.numeric(values)) {
          paste("from", round(min(values, na.rm = TRUE), 1), "to", round(max(values, na.rm = TRUE), 1))
        } else {
          vals <- unique(values)
          lvl <- levels(values)
          ifelse(
            is.null(lvl),
            paste(vals, collapse = ", "),
            paste(lvl, collapse = ", ")
          )
        }
        cat(sprintf("   🔢 Values: %s\n", options))
      } else {
        cat("   ⚠️ Variable not found in 'data'\n")
      }
      cat("\n")
    }
  }
  # ANALYSIS
  # Overview plotting
  # Select numeric data and reshape to long format, one row per (group, variable, value)
  numeric_data <- data %>%
    dplyr::select(dplyr::where(is.numeric), i.collection) %>%
    dplyr::select(-i.completeness) %>%
    tidyr::pivot_longer(
      cols = -i.collection,
      names_to = "variable",
      values_to = "value"
    ) %>%
    dplyr::rename(group = i.collection) %>%
    dplyr::filter(!variable %in% excluded_vars)
  # Create boxplot for each numeric variable
  numeric_plots <- list()
  for (var in unique(numeric_data$variable)) {
    # fetch the var data
    df_var <- numeric_data %>% dplyr::filter(variable == var) %>% dplyr::filter(is.finite(value))
    info <- var_info[[var]]
    # count sample sizes per group and compute where to place the n-labels
    counts <- df_var %>%
      dplyr::group_by(group) %>%
      dplyr::summarize(
        n = sum(!is.na(value)),
        y_min = ifelse(all(is.na(value)), NA, min(value, na.rm = TRUE)),
        y_max = ifelse(all(is.na(value)), NA, max(value, na.rm = TRUE)),
        .groups = "drop"
      ) %>%
      dplyr::mutate(  # if all values are identical or NA, offset to 0
        range = ifelse(!is.na(y_min) & y_min != y_max, y_max - y_min, 0),
        label_y = y_min - 0.05 * range  # 5% below the min
      )
    # Boxplot the variable
    p <- ggplot2::ggplot(df_var, ggplot2::aes(x = group, y = value)) +
      ggplot2::geom_boxplot() +
      ggplot2::geom_text(
        data = counts,
        ggplot2::aes(x = group, y = label_y, label = paste("n =", n)),
        vjust = 1,
        size = 3
      ) +
      ggplot2::labs(
        title = var,
        subtitle = stringr::str_wrap(info$descr, width = 60),
        x = NULL,
        y = "Value"
      ) +
      ggplot2::theme_minimal() +
      ggplot2::coord_cartesian(clip = "off") +  # let n-labels draw below the panel
      ggplot2::theme(
        axis.text = ggplot2::element_text(size = 6),
        axis.title = ggplot2::element_text(size = 6),
        plot.title = ggplot2::element_text(size = 10),
        plot.subtitle = ggplot2::element_text(size = 6),
        plot.margin = ggplot2::unit(c(5, 5, 20, 5), "pt")
      )
    # save the plot to the list
    numeric_plots[[var]] <- p
  }
  # Summarise non-numeric variables
  sum_tables <- sum_non_numeric(data, "i.completeness")
  non_numeric_plots <- list()
  for (var in names(sum_tables)) {
    # fetch the var data
    df <- sum_tables[[var]] %>%
      tidyr::pivot_longer(
        cols = -group,
        names_to = "Option",
        values_to = "Count"
      )
    info <- var_info[[var]]
    # restore the factor levels so bars keep the original ordering
    if (var %in% names(data) && is.factor(data[[var]])) {
      desired_levels <- levels(data[[var]])
      df$Option <- factor(df$Option, levels = desired_levels)
    } else {
      df$Option <- factor(df$Option, levels = sort(unique(df$Option)))
    }
    # create a grouped bar plot
    p <- ggplot2::ggplot(
      df,
      ggplot2::aes(x = group, y = Count, fill = Option)) +
      ggplot2::geom_bar(stat = "identity", position = ggplot2::position_dodge()) +
      ggplot2::labs(
        title = var,
        subtitle = stringr::str_wrap(info$descr, width = 60),
        x = NULL,
        y = "Count"
      ) +
      ggplot2::scale_fill_viridis_d() +
      ggplot2::theme_minimal() +
      ggplot2::theme(
        axis.text = ggplot2::element_text(size = 6),
        axis.title = ggplot2::element_text(size = 6),
        plot.title = ggplot2::element_text(size = 10),
        plot.subtitle = ggplot2::element_text(size = 6),
        axis.title.x = ggplot2::element_blank(),
        legend.text = ggplot2::element_text(size = 8),
        legend.title = ggplot2::element_text(size = 8)
      )
    non_numeric_plots[[var]] <- p
  }
  # Save the results
  results$numeric_data <- numeric_data
  results$non_numeric_data <- sum_tables
  results$numeric_plots <- numeric_plots
  results$non_numeric_plots <- non_numeric_plots
  # Combine plots by prefix; *group* plots get their own combined image
  group_plots <- non_numeric_plots[ grepl("group", names(non_numeric_plots)) ]
  other_plots <- non_numeric_plots[ !grepl("group", names(non_numeric_plots)) ]
  non_group_plots <- c(numeric_plots, other_plots)
  get_prefix <- function(name) sub("\\..*", "", name)  # "i.site" -> "i"
  plot_names <- names(non_group_plots)
  prefixes <- unique(get_prefix(plot_names))
  excluded_prefixes <- NULL
  prefixes <- setdiff(prefixes, excluded_prefixes)
  merged_plots <- list()
  if (!dir.exists(file.path(savepath, "overview"))) {
    dir.create(file.path(savepath, "overview"), recursive = TRUE)
  }
  for (prefix in prefixes) {
    plots_for_prefix <- non_group_plots[ sapply(plot_names, function(x) get_prefix(x) == prefix) ]
    file_name <- file.path(savepath, "overview", paste0(prefix, "_plots.png"))
    merged_plot <- gridExtra::arrangeGrob(grobs = plots_for_prefix, ncol = 4)
    #tiff(filename = file_name, width = 30, height = 20, units = "cm", res = 300)
    png(filename = file_name, width = 30, height = 20, units = "cm", res = 300)
    grid::grid.draw(merged_plot)
    dev.off()
    merged_plots[[prefix]] <- merged_plot
  }
  if (length(group_plots) > 0) {
    file_name <- file.path(savepath, "overview", "group_plots.png")
    merged_group_plot <- gridExtra::arrangeGrob(grobs = group_plots, ncol = length(group_plots))
    #tiff(filename = file_name, width = 30, height = 10, units = "cm", res = 300)
    png(filename = file_name, width = 30, height = 10, units = "cm", res = 300)
    grid::grid.draw(merged_group_plot)
    dev.off()
  }
  results$merged_plots <- merged_plots
  #if (talky) cat("💾 Files have been saved to", outpath)
  return(results)
}
Variables description & data inspection
source("overviewer.r")
r <- overviewer(inpath, outpath, talky = T)
## Petrography: 103 records
## GCMS: 105 records
## Carbonisation: 371 records
##
## Max number of intersected id-s: 103
## Number of records in the final table: 103
##
##
## 📦 Variable Descriptions:
##
## 📌 id:
## 🔧 Type: text id
## 📝 Description: Unique sample id
## 🔢 Values: Y15, Y19, Y20, Y26, Y28, Y29, Y33, Y34, Y40, Y49, Y50, Y51, Y52, Y62, Y78, Y83, Y84, Y85, Y91, Y117, Y119, Y134, Y151, Y152, Y156, Y169, Y172, Y179, Y215, Y217, G4, G5, G9, G10, G19, G20, G21, G22, G27, G28, G30, G34, G35, G39, G41, G42, G47, G48, G49, G55, G56, G60, G61, G62, G63, G66, G67, G70, G71, G73, G75, G80, G81, G87, G88, G90, G91, G93, G95, G96, G97, G98, G99, G104, K6, K7, K8, K9, K10, K11, K12, K13, K14, K15, K16, K17, K18, K19, K20, K21, K22, K23, K24, K25, K26, K27, K28, K29, K30, K31, K32, K33, K34
##
## 📌 i.site:
## 🔧 Type: txt fct
## 📝 Description: The name of the site
## 🔢 Values: Gaoqingcaopo, Kanjiazhai, Yangxinliwu
##
## 📌 i.period:
## 🔧 Type: txt ord
## 📝 Description: The dynastie
## 🔢 Values: Shang, Zhou
##
## 📌 i.lip:
## 🔧 Type: lgl
## 📝 Description: TRUE if lip is present in a sample
## 🔢 Values: TRUE, FALSE
##
## 📌 i.neck:
## 🔧 Type: lgl
## 📝 Description: TRUE if neck is present in a sample
## 🔢 Values: TRUE, FALSE
##
## 📌 i.shoulder:
## 🔧 Type: lgl
## 📝 Description: TRUE if shoulder is present in a sample
## 🔢 Values: TRUE, FALSE
##
## 📌 i.body:
## 🔧 Type: lgl
## 📝 Description: TRUE if body is present in a sample
## 🔢 Values: TRUE, FALSE
##
## 📌 i.foot:
## 🔧 Type: lgl
## 📝 Description: TRUE if foot is present in a sample
## 🔢 Values: FALSE, TRUE
##
## 📌 i.crotch:
## 🔧 Type: lgl
## 📝 Description: TRUE if crotch is present in a sample
## 🔢 Values: FALSE, TRUE
##
## 📌 i.completeness:
## 🔧 Type: num dbl | 📏 Units: %%
## 📝 Description: 100 means all parts are present in a sample, less means a fragment is missing some parts
## 🔢 Values: from 33 to 100
##
## 📌 i.collection:
## 🔧 Type: txt fct
## 📝 Description: A combination of period and a first letter of the site
## 🔢 Values: Shang.Y, Shang.G, Zhou.G, Zhou.K
##
## 📌 m.body_width:
## 🔧 Type: num dbl | 📏 Units: mm
## 📝 Description: Width of the body
## 🔢 Values: from 171 to 171
##
## 📌 m.height:
## 🔧 Type: num dbl | 📏 Units: mm
## 📝 Description: Height of the vessel fragment
## 🔢 Values: from 11.5 to 15
##
## 📌 m.rim:
## 🔧 Type: num dbl | 📏 Units: mm
## 📝 Description: The size of the rim
## 🔢 Values: from 15 to 57
##
## 📌 m.orifice:
## 🔧 Type: num dbl | 📏 Units: mm
## 📝 Description: The size of the orifice
## 🔢 Values: from 10 to 52
##
## 📌 m.cord_mark_width:
## 🔧 Type: num dbl | 📏 Units: mm
## 📝 Description: The width of a cordmark
## 🔢 Values: from 1 to 17
##
## 📌 m.cord_mark_type:
## 🔧 Type: txt fct | 📏 Units: mm
## 📝 Description: Cordmark type
## ⚠️ Variable not found in 'data'
##
## 📌 c.col_in_lip:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the lip part from inside
## 🔢 Values: carb, clear
##
## 📌 c.col_in_neck:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the neck part from inside
## 🔢 Values: carb, clear, water
##
## 📌 c.col_in_shoulder:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the shoulder part from inside
## 🔢 Values: carb, clear, water
##
## 📌 c.col_in_body:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the body part from inside
## 🔢 Values: carb, clear, water
##
## 📌 c.col_out_lip:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the lip part from outside
## 🔢 Values: carb, clear
##
## 📌 c.col_out_neck:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the neck part from outside
## 🔢 Values: carb, clear, oxide
##
## 📌 c.col_out_shoulder:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the shoulder part from outside
## 🔢 Values: carb, clear, oxide
##
## 📌 c.col_out_body:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the body part from outside
## 🔢 Values: carb, clear, oxide, water
##
## 📌 c.col_in_foot:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the foot part from inside
## 🔢 Values: carb, clear, water
##
## 📌 c.col_out_foot:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the foot part from outside
## 🔢 Values: carb, clear, oxide
##
## 📌 c.col_out_crotch:
## 🔧 Type: txt fct
## 📝 Description: A coloration of the crotch part from outside
## 🔢 Values: carb, clear, oxide
##
## 📌 c.group:
## 🔧 Type: txt ord
## 📝 Description: A classification according to carbonisation analysis
## 🔢 Values: dry, wet
##
## 📌 c.conf:
## 🔧 Type: num dbl | 📏 Units: %
## 📝 Description: A confidence level for 'Dry or Wet' classification, 1 means dead certainty
## 🔢 Values: from 0.3 to 1
##
## 📌 g.lipids_conc:
## 🔧 Type: num dbl | 📏 Units: %
## 📝 Description: A concentration of lipids in a sample, promille?
## 🔢 Values: from 0.9 to 14992.7
##
## 📌 g.scfa:
## 🔧 Type: num dbl | 📏 Units: %
## 📝 Description: Relative abundance of short-chain fatty acids (C2:C6)
## 🔢 Values: from 1.5 to 21.4
##
## 📌 g.mcfa:
## 🔧 Type: num dbl | 📏 Units: %
## 📝 Description: Relative abundance of medium-chain fatty acids (C6:C12)
## 🔢 Values: from 19.5 to 83.5
##
## 📌 g.lcfa:
## 🔧 Type: num dbl | 📏 Units: %
## 📝 Description: Relative abundance of long-chain fatty acids (>C12)
## 🔢 Values: from 0 to 50.9
##
## 📌 g.uns_fa:
## 🔧 Type: num dbl | 📏 Units: %
## 📝 Description: Relative abundance of unsaturated fatty acids
## 🔢 Values: from 0 to 33.2
##
## 📌 g.diacids:
## 🔧 Type: num dbl | 📏 Units: %
## 📝 Description: Relative abundance of dicarboxylic acids: oxidized fatty acids with two carboxyl groups
## 🔢 Values: from 0 to 14.8
##
## 📌 g.alkanes:
## 🔧 Type: num dbl | 📏 Units: %
## 📝 Description: Relative abundance of saturated hydrocarbons
## 🔢 Values: from 0 to 73.7
##
## 📌 g.p_s:
## 🔧 Type: num dbl
## 📝 Description: Pristane/Phytane ratio ??
## 🔢 Values: from 0.6 to 3.4
##
## 📌 g.o_s:
## 🔧 Type: num dbl
## 📝 Description: Odd-over-even carbon number predominance ??
## 🔢 Values: from 0 to 1.3
##
## 📌 g.aapac18e_h:
## 🔧 Type: num dbl
## 📝 Description: An index related to C18 alkylphenanthrene homologs (E/H variants) ??
## 🔢 Values: from 1.1 to 12.3
##
## 📌 g.srr:
## 🔧 Type: num dbl
## 📝 Description: A sterane ratio expressed as a percentage ??
## 🔢 Values: from 42.6 to 77.6
##
## 📌 g.group:
## 🔧 Type: txt ord
## 📝 Description: A classification for organics source based on expert analysis
## 🔢 Values: plant, mixture, animal
##
## 📌 g.conf:
## 🔧 Type: num dbl
## 📝 Description: A confidence level for organic source classification (g.group), 1 means dead certainty
## 🔢 Values: from 0.3 to 1
##
## 📌 g.plant_count:
## 🔧 Type: num int
## 📝 Description: A rowwise count for 'plant'
## 🔢 Values: from 0 to 7
##
## 📌 g.tree_count:
## 🔧 Type: num int
## 📝 Description: A rowwise count for 'tree'
## 🔢 Values: from 0 to 1
##
## 📌 g.cereal:
## 🔧 Type: lgl
## 📝 Description: True if 'cereal' was mention in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 g.fruit:
## 🔧 Type: lgl
## 📝 Description: True if 'fruit' was mention in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 g.vegetable:
## 🔧 Type: lgl
## 📝 Description: True if 'vegetable' was mention in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 g.resin:
## 🔧 Type: lgl
## 📝 Description: True if 'resin' was mention in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 g.fish:
## 🔧 Type: lgl
## 📝 Description: True if 'fish' was mention in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 g.complex_mxtr:
## 🔧 Type: lgl
## 📝 Description: True if 'complex mixture' was mention in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.optical_activity:
## 🔧 Type: lgl
## 📝 Description: True if optical activity is 'Active'
## 🔢 Values: FALSE
##
## 📌 p.mica:
## 🔧 Type: txt ord
## 📝 Description: The relative abundance of mica minerals
## 🔢 Values: None, No, Rare, Few, Moderate, Yes
##
## 📌 p.group:
## 🔧 Type: txt fct
## 📝 Description: A classification for petrography based on expert analysis
## 🔢 Values: CPX Sand, Feldspar Sand, Feldspar Sand + Lime, Fine Silt, Granitic + Sed, Granitic Sand, Granodiorite, Metamorphic, Sandstone, Tonalite, Volcanic
##
## 📌 p.lower_fr_bound:
## 🔧 Type: num dbl | 📏 Units: mm
## 📝 Description: A lower bound for rock fragments approximate size range
## 🔢 Values: from 0 to 1
##
## 📌 p.upper_fr_bound:
## 🔧 Type: num dbl
## 📝 Description: An upper bound for rock fragments approximate size range
## 🔢 Values: from 0.1 to 6
##
## 📌 p.lower_roundness_bound:
## 🔧 Type: txt ord
## 📝 Description: A lower bound for rock roundndess
## 🔢 Values: angular, subangular, subround, round
##
## 📌 p.upper_roundness_bound:
## 🔧 Type: txt ord
## 📝 Description: An upper bound for rock roundndess
## 🔢 Values: angular, subangular, subround, round
##
## 📌 p.granite:
## 🔧 Type: lgl
## 📝 Description: True if 'granite' was mentioned in the comments
## 🔢 Values: TRUE, FALSE
##
## 📌 p.granodiorite:
## 🔧 Type: lgl
## 📝 Description: True if 'granodiorite' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.diorite:
## 🔧 Type: lgl
## 📝 Description: True if 'diorite' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.sandstone:
## 🔧 Type: lgl
## 📝 Description: True if 'sandstone' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.mudstone:
## 🔧 Type: lgl
## 📝 Description: True if 'mudstone' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.limestone:
## 🔧 Type: lgl
## 📝 Description: True if 'limestone' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.gritstone:
## 🔧 Type: lgl
## 📝 Description: True if 'gritstone' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.volcanic:
## 🔧 Type: lgl
## 📝 Description: True if 'volcanic' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.andesite:
## 🔧 Type: lgl
## 📝 Description: True if 'andesite' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.microgranite:
## 🔧 Type: lgl
## 📝 Description: True if 'microgranite' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
##
## 📌 p.ksp:
## 🔧 Type: lgl
## 📝 Description: True if 'ksp' was mentioned in the comments
## 🔢 Values: FALSE, TRUE
# Collect the overview images produced by overviewer() for embedding below.
overvw_plots <- list.files(file.path(outpath, "overview"), full.names = TRUE)
# NOTE(review): this ignores overvw_plots entirely — the image-markdown
# template (e.g. "![](", path, ")") appears to have been stripped during
# extraction; verify against the original Rmd source.
imgs_markdown <- paste0("", collapse = "\n\n")
asis_output(imgs_markdown) # Generate clickable image HTML tags (imgs_html)
Spectral analysis
Overview
GCMS data is represented with a continuous set of mass-spectra,
m/z spectra specifically, where m stands for
mass, and z stands for charge number of ions. We can
present the GCMS data for a random spectrum file as a
heatmap, where
larger intensity would mean darker color, and where
retention time would be X axis, and
m/z would be Y axis. We could first look at
the summed projections of such heatmap: summing by m(Y)
gives us Total Ion Chromatogram (TIC), i.e. how many ions
were detected for each time; summing by retention time (X) gives us
general m/z spectrum of the sample.
Spectra Checker Function:
#' Index mzML spectra files and match them against specimen metadata.
#'
#' Lists all *.mzML files in `spec_path`, derives a sample id from each
#' filename (one letter prefix + digits, e.g. "G12_run.mzML" -> "G12"), and
#' joins the files to the info table read from `in_path`.
#'
#' @param in_path   Folder containing the info/carbonisation excel file
#'   (name comes from info_and_carb_data_file_name in settings.r).
#' @param spec_path Folder containing the *.mzML spectra files.
#' @return A tibble with one row per matched file: id, period, site, path.
#'   Stops if no mzML files are found or nothing matches; warns about files
#'   whose parsed id is absent from the metadata.
spec_checker <- function(in_path, spec_path) {
  # List mzML files
  mzml_list <- list.files(path = spec_path, pattern = "\\.mzML$", full.names = TRUE)
  if (length(mzml_list) == 0) stop("No mzML files were found in the folder")
  # loadings
  source("settings.r")
  source("load_pckgs.r")
  dependencies <- c("readxl", "dplyr", "stringr")
  invisible(lapply(dependencies, load_pkg))
  # read metadata; only the id/period/site columns are needed here
  df <- readxl::read_excel(file.path(in_path, info_and_carb_data_file_name), na = "") %>%
    dplyr::ungroup() %>%
    dplyr::rename(
      site = site_name,
      id = aux_id
    ) %>%
    dplyr::select(id, period, site)
  # process filenames: id = leading letter + following digits
  mzml_tbl <- dplyr::tibble(path = mzml_list) %>%
    dplyr::mutate(
      filename = basename(mzml_list),
      prefix = stringr::str_extract(filename, "^[A-Za-z]"),
      num = stringr::str_extract(filename, "(?<=^[A-Za-z])\\d+"),
      postfix = stringr::str_extract(filename, "(?<=\\d)(.*)(?=\\.mzML)"),
      id = paste0(prefix, num)
    ) %>%
    dplyr::select(path, id)
  # Check filenames: ids parsed from files but missing in the metadata
  missing_ids <- setdiff(mzml_tbl$id, df$id)
  if (length(missing_ids) != 0) warning("Following files were not matched: ", paste(missing_ids, collapse = ", "))
  # Merge metadata and file information
  # (namespace-qualified for consistency with the rest of this function,
  # and so the call works even when dplyr is loaded but not attached)
  data <- dplyr::inner_join(df, mzml_tbl, by = "id")
  if (nrow(data) == 0) stop("No filename matches the metadata ids.")
  cat("Number of matched mzML files:", nrow(data), "\n")
  return(data)
}
Spectrum Overviewer Function:
#' Visualize one GCMS spectrum as a heatmap plus its two projections.
#'
#' Builds (1) an intensity heatmap over retention time x m/z (log10 scale),
#' (2) the Total Ion Current chromatogram, and (3) the summed mass spectrum,
#' arranged on a 2x2 grid so the projections align with the heatmap axes.
#'
#' @param index_tbl A tibble with a `path` column, as returned by
#'   spec_checker().
#' @param num Row index of the file to visualize.
#' @return A list with one element, `plot`: the combined grob.
spec_overviewer <- function(index_tbl, num) {
  # loadings
  source("settings.r")
  source("load_pckgs.r")
  dependencies <- c("dplyr", "mzR", "ggplot2", "gridExtra", "magick")
  invisible(lapply(dependencies, load_pkg))
  # check if the specified file exists
  file_path <- index_tbl$path[[num]]
  if (!file.exists(file_path)) {
    stop("The file ", file_path, " does not exist.")
  }
  file_name <- basename(file_path)
  # open mzML file and extract info
  message("Fetching data...")
  ms <- openMSfile(file_path)
  hdr <- header(ms)
  # NOTE(review): the m/z range below is taken from the FIRST scan only —
  # peaks outside that range in later scans are silently dropped; confirm
  # this is acceptable for these files.
  peaks_data <- peaks(ms, 1)
  nScans <- nrow(hdr)
  # define the future plot parameters
  rt_min <- min(hdr$retentionTime)
  rt_max <- max(hdr$retentionTime)
  # nScans <- length(hdr$retentionTime)
  rt_bins <- seq(rt_min, rt_max, length.out = nScans + 1)
  rt_midpoints <- (rt_bins[-1] + rt_bins[-length(rt_bins)]) / 2
  rt_limits <- c(rt_min, rt_max)
  mz_min <- min(peaks_data[, 1])
  mz_max <- max(peaks_data[, 1])
  nBinsMz <- round(mz_max - mz_min) + 1  # roughly 1 Da per bin
  mz_bins <- seq(mz_min, mz_max, length.out = nBinsMz + 1)
  mz_midpoints <- (mz_bins[-1] + mz_bins[-length(mz_bins)]) / 2
  # create an empty matrix to store the intensities
  heatmap_matrix <- matrix(0, nrow = nBinsMz, ncol = nScans)
  # populate each scan's data to the heatmap matrix
  for (i in seq_len(nScans)) {
    rt <- hdr$retentionTime[i]
    rt_bin <- findInterval(rt, rt_bins, rightmost.closed = TRUE)
    if (rt_bin < 1 || rt_bin > nScans) next
    peaks_data <- peaks(ms, i)
    if (nrow(peaks_data) == 0) next
    for (j in 1:nrow(peaks_data)) {
      mz_val <- peaks_data[j, 1]
      intensity <- peaks_data[j, 2]
      mz_bin <- findInterval(mz_val, mz_bins, rightmost.closed = TRUE)
      if (mz_bin < 1 || mz_bin > nBinsMz) next
      heatmap_matrix[mz_bin, rt_bin] <- heatmap_matrix[mz_bin, rt_bin] + intensity
    }
  }
  # log-transformation (+1 keeps empty bins at zero)
  heatmap_matrix_log <- log10(heatmap_matrix + 1)
  # plotting
  message("Plots building...")
  # plot 1: heatmap of log-intensity over retention time x m/z
  heatmap <- data.frame(
    rt = rep(rt_midpoints, each = nBinsMz),
    mz = rep(mz_midpoints, times = nScans),
    intensity = as.vector(heatmap_matrix_log)
  )
  p1 <- ggplot(
    heatmap,
    aes(x = rt, y = mz, fill = intensity)
  ) +
    geom_tile() +
    scale_fill_gradient(
      low = "white",
      high = "black",
      name = "log10(Intensity)"
    ) +
    labs(
      title = paste("Intensity Spectrum for ", file_name, "(log10)"),
      x = "Retention Time (sec)",
      y = "m/z"
    ) +
    scale_x_continuous(limits = rt_limits, expand = c(0, 0)) +
    theme_minimal() +
    theme(legend.position = "none")
  # plot 2: TIC — sum over m/z per scan (from the file header)
  tic <- hdr$totIonCurrent
  df_tic <- data.frame(rt = hdr$retentionTime, tic = tic)
  p2 <- ggplot(
    df_tic,
    aes(x = rt, y = tic)
  ) +
    geom_line(col = "blue") +
    labs(
      title = paste("Total Ion Current Chromatogram for ", file_name),
      x = "Retention Time (sec)",
      y = "Total Ion Current"
    ) +
    scale_x_continuous(limits = rt_limits, expand = c(0, 0)) +
    theme_minimal()
  # plot 3: summed mass spectrum — sum over retention time per m/z bin
  mass_spectrum <- rep(0, nBinsMz)
  for (i in 1:nScans) {
    peaks_data <- peaks(ms, i)
    if (nrow(peaks_data) == 0) next
    for (j in 1:nrow(peaks_data)) {
      mz_val <- peaks_data[j, 1]
      intensity <- peaks_data[j, 2]
      mz_bin <- findInterval(mz_val, mz_bins, rightmost.closed = TRUE)
      if (mz_bin >= 1 && mz_bin <= nBinsMz) {
        mass_spectrum[mz_bin] <- mass_spectrum[mz_bin] + intensity
      }
    }
  }
  mass_spectrum <- data.frame(mz = mz_midpoints, intensity = mass_spectrum)
  p3 <- ggplot(
    mass_spectrum,
    aes(x = mz, y = intensity)
  ) +
    geom_line(col = "red") +
    # FIX: axis labels were swapped — labs() labels the AESTHETIC, and
    # coord_flip() moves each label together with its data, so x must be
    # "m/z" (drawn vertically after the flip) and y "Summed Intensity".
    labs(
      title = paste("Summed Mass Spectrum for ", file_name),
      x = "m/z",
      y = "Summed Intensity"
    ) +
    theme_minimal() +
    coord_flip()  # rotate so the m/z axis lines up with the heatmap's
  # close the mzML file
  mzR::close(ms)
  # grid plotting: p3 | p1 on top, empty | p2 below
  layout_matrix <- matrix(c(3, 1, NA, 2), nrow = 2, byrow = TRUE)
  combined_plot <- gridExtra::arrangeGrob(p1, p2, p3, layout_matrix = layout_matrix)
  # ggsave(
  #   filename = save_path,
  #   plot = combined_plot,
  #   width = 16,
  #   height = 16,
  #   dpi = 300
  # )
  # message("Image saved to: ", basename(save_path))
  # img <- magick::image_read(save_path)
  # plot(img)
  results <- list(plot = combined_plot)
  return(results)
}
# Build the index of mzML files matched to specimen metadata.
source("spec_checker.r")
spec_index <- spec_checker(inpath, path) # checks the files and returns the list of verified paths & metadata
## Number of matched mzML files: 105
#print(spec_index)
# Visualize the first spectrum in the index (heatmap + TIC + summed spectrum).
source("spec_overviewer.r")
spec_overviewer <- spec_overviewer(spec_index, 1) # provides a visualization of GCMS data for the first spectrum in the folder
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.13)
## than is installed on your system (1.0.14). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
plot_saver(specovrw$plot, outpath, "spectrum_overview.png", 10, 10, 100)
PCA
We can now compare samples’ TICs using Principal component analysis
in order to find any clustering pattern achieved by rotating the initial
variables (ion intensity per each retention time) and presenting them as
principal components, which we then could attempt to interpret. The
function ticsnpeaks fetches the data, specifically, TICs
and peaks:
TICs are total ion chromatograms (the summed result of
sample scanning), peaks are retention time ranges where
these TICs showed significantly elevated intensity; the
sdnum parameter set to 3 means that the
ticsnpeaks function selects only those retention times
where values exceed the mean TIC + 3 standard deviations.
TICs & Peaks Function:
ticsnpeaks <- function(index_tbl, sdnum = 3) {
  # Fetch TICs (total ion chromatograms) and high-intensity peak regions
  # from every mzML file listed in the index table.
  #
  # Args:
  #   index_tbl: table with a `path` column of verified mzML file paths
  #              (output of spec_checker()).
  #   sdnum:     threshold multiplier -- a scan counts as "high" when its
  #              TIC exceeds mean(tic) + sdnum * sd(tic).
  #
  # Returns:
  #   list(tics  = named list of per-file TIC data frames (time, tic),
  #        peaks = named list of per-file tibbles, one row per contiguous
  #                peak region: indices, times, apex TIC, base m/z, duration)
  # loadings / dependencies
  source("settings.r")
  source("load_pckgs.r")
  dependencies <- c("readxl", "dplyr", "mzR", "pracma")
  invisible(lapply(dependencies, load_pkg))
  # create empty result lists
  tic_list <- list()
  peak_list <- list()
  # loop over each sample in the index table
  for (file in index_tbl$path) {
    cat("Processing file:", basename(file), "\n")
    ms <- mzR::openMSfile(file)
    tic <- mzR::tic(ms)
    tic_list[[basename(file)]] <- tic
    threshold <- mean(tic$tic) + sdnum * sd(tic$tic)
    # Identify all scan indices above the threshold
    idx_above <- which(tic$tic > threshold)
    if (length(idx_above) == 0) {
      cat("No peaks detected above the threshold.\n")
    } else {
      # Split the above-threshold scans into contiguous runs: a new group
      # starts wherever the gap between consecutive indices exceeds 1.
      # (The previous rle(diff(...)) bookkeeping merged isolated
      # single-scan peaks separated by equal-sized gaps, because equal
      # gap values collapse into one rle entry.)
      run_id <- cumsum(c(1L, diff(idx_above) != 1L))
      peak_temp_list <- lapply(split(idx_above, run_id), function(these_idx) {
        peak_start <- min(these_idx)
        peak_end <- max(these_idx)
        peak_max <- these_idx[which.max(tic$tic[these_idx])]
        # Extract the spectrum at the apex scan; positional argument
        # because mzR::peaks() names its second formal `scans`
        sp <- mzR::peaks(ms, peak_max)
        if (!is.null(sp) && nrow(sp) > 0) {
          # m/z of the most intense ion in the apex spectrum (base peak)
          base_mz <- sp[which.max(sp[, 2]), 1]
        } else {
          base_mz <- NA
        }
        tibble(
          start_index = peak_start,
          max_index = peak_max,
          end_index = peak_end,
          start_time = tic$time[peak_start],
          max_time = tic$time[peak_max],
          end_time = tic$time[peak_end],
          max_peak_tic = tic$tic[peak_max],
          base_mz = base_mz,
          duration = tic$time[peak_end] - tic$time[peak_start]
        )
      })
      # Combine all peak regions of this file into one table
      peak_list[[basename(file)]] <- dplyr::bind_rows(peak_temp_list)
    }
    close(ms)
  }
  return(list(tics = tic_list, peaks = peak_list))
}
source("ticsnpeaks.r")
# extract TICs and peak tables from every verified spectrum (threshold = mean + 3 SD)
e <- ticsnpeaks(spec_index, 3) # fetches tics and lists the high peaks
## Processing file: Y15A.mzML
## Processing file: Y19A.mzML
## Processing file: Y20A.mzML
## Processing file: Y26A.mzML
## Processing file: Y28A.mzML
## Processing file: Y29_A.mzML
## Processing file: Y33_A.mzML
## Processing file: Y34A.mzML
## Processing file: Y40A.mzML
## Processing file: Y49_A.mzML
## Processing file: Y50A.mzML
## Processing file: Y51_A.mzML
## Processing file: Y52_A.mzML
## Processing file: Y62A.mzML
## Processing file: Y78_A.mzML
## Processing file: Y83A.mzML
## Processing file: Y84A.mzML
## Processing file: Y85_A.mzML
## Processing file: Y91_A.mzML
## Processing file: Y117A.mzML
## Processing file: Y119A.mzML
## Processing file: Y134_A.mzML
## Processing file: Y151A.mzML
## Processing file: Y152_A.mzML
## Processing file: Y156_A.mzML
## Processing file: Y169A.mzML
## Processing file: Y172A.mzML
## Processing file: Y179A.mzML
## Processing file: Y215A.mzML
## Processing file: Y217_A.mzML
## Processing file: G4A.mzML
## Processing file: G5A.mzML
## Processing file: G9A.mzML
## Processing file: G10A.mzML
## Processing file: G19A.mzML
## Processing file: G20A.mzML
## Processing file: G21A.mzML
## Processing file: G22A.mzML
## Processing file: G27A.mzML
## Processing file: G28A.mzML
## Processing file: G30A.mzML
## Processing file: G34A.mzML
## Processing file: G35A.mzML
## Processing file: G39A.mzML
## Processing file: G41A.mzML
## Processing file: G42A.mzML
## Processing file: G47A.mzML
## Processing file: G48A.mzML
## Processing file: G49A.mzML
## Processing file: G55_A.mzML
## Processing file: G56A.mzML
## Processing file: G57A.mzML
## Processing file: G60A.mzML
## Processing file: G61A.mzML
## Processing file: G62A.mzML
## Processing file: G63A.mzML
## Processing file: G66A.mzML
## Processing file: G67A.mzML
## Processing file: G70_A.mzML
## Processing file: G71_A.mzML
## Processing file: G73A.mzML
## Processing file: G75A.mzML
## Processing file: G80A.mzML
## Processing file: G81A.mzML
## Processing file: G87A.mzML
## Processing file: G88A.mzML
## Processing file: G90A.mzML
## Processing file: G91A.mzML
## Processing file: G93A.mzML
## Processing file: G94A.mzML
## Processing file: G95A.mzML
## Processing file: G96A.mzML
## Processing file: G97A.mzML
## Processing file: G98A.mzML
## Processing file: G99A.mzML
## Processing file: G104A.mzML
## Processing file: K6A.mzML
## Processing file: K7A.mzML
## Processing file: K8A.mzML
## Processing file: K9A.mzML
## Processing file: K10A.mzML
## Processing file: K11A.mzML
## Processing file: K12A.mzML
## Processing file: K13A.mzML
## Processing file: K14A.mzML
## Processing file: K15A.mzML
## Processing file: K16A.mzML
## Processing file: K17A.mzML
## Processing file: K18A.mzML
## Processing file: K19A.mzML
## Processing file: K20A.mzML
## Processing file: K21A.mzML
## Processing file: K22A.mzML
## Processing file: K23A.mzML
## Processing file: K24A.mzML
## Processing file: K25A.mzML
## Processing file: K26A.mzML
## Processing file: K27A.mzML
## Processing file: K28A.mzML
## Processing file: K29A.mzML
## Processing file: K30A.mzML
## Processing file: K31A.mzML
## Processing file: K32A.mzML
## Processing file: K33A.mzML
## Processing file: K34A.mzML
summary(e) # the overall structure of the function's output: two parallel 105-element lists
## Length Class Mode
## tics 105 -none- list
## peaks 105 -none- list
head((e$tics[[1]]), n = 30) # print first 30 rows of TIC data (time, tic) for the first spectrum
## time tic
## 1 257.325 11874
## 2 257.426 13827
## 3 257.527 12015
## 4 257.628 13322
## 5 257.729 12393
## 6 257.831 12461
## 7 257.932 10035
## 8 258.033 9754
## 9 258.134 11972
## 10 258.236 10696
## 11 258.337 10185
## 12 258.439 11520
## 13 258.540 10274
## 14 258.641 9594
## 15 258.742 10720
## 16 258.844 9818
## 17 258.945 10534
## 18 259.046 10343
## 19 259.148 11482
## 20 259.249 9802
## 21 259.350 11165
## 22 259.452 10142
## 23 259.553 9610
## 24 259.655 10377
## 25 259.756 11029
## 26 259.857 12006
## 27 259.959 11370
## 28 260.060 12293
## 29 260.161 11131
## 30 260.262 9310
print(e$peaks[[1]]) # print the table of detected peak regions for the first spectrum
## # A tibble: 3 × 9
## start_index max_index end_index start_time max_time end_time max_peak_tic
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 8061 8071 8083 1074. 1075. 1076. 3520526
## 2 9193 9200 9206 1188. 1189. 1190. 1644834
## 3 15351 15386 15413 1812. 1816. 1818. 8040740
## # ℹ 2 more variables: base_mz <dbl>, duration <dbl>
print(e$peaks[[2]]) # print the table of detected peak regions for the second spectrum
## # A tibble: 4 × 9
## start_index max_index end_index start_time max_time end_time max_peak_tic
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 8059 8075 8087 1074. 1075. 1077. 7768377
## 2 9187 9210 9222 1188. 1191. 1192. 10623814
## 3 11192 11194 11194 1391. 1392. 1392. 1637062
## 4 15343 15378 15398 1812. 1816. 1818. 7868235
## # ℹ 2 more variables: base_mz <dbl>, duration <dbl>
The data for the PCA should not be skewed, so we tested various
approaches to de-skewing the data and chose the
Box-Cox transformation. The next function applies this
transformation as well as the subsequent scaling
before proceeding to the PCA analysis.
PCA of TICs Function:
process_tics <- function(
  data = NULL,
  index_tbl = NULL,
  only_high_tic = TRUE,
  expander = 10 ) {
  # Box-Cox transform, scale and PCA the interpolated TIC matrix, then
  # assemble the summary plot(s).
  #
  # Args:
  #   data:          list(tics, peaks) as returned by ticsnpeaks().
  #   index_tbl:     sample index table with `path`, `period`, `site`.
  #   only_high_tic: if TRUE, keep only scans inside the merged,
  #                  expanded high-TIC peak ranges before the PCA.
  #   expander:      number of scans by which each peak range is widened
  #                  on both sides.
  #
  # Returns: list(plot = combined ggplot/patchwork object)
  if (is.null(data) || is.null(index_tbl)) stop("No data")
  # loadings / dependencies
  source("settings.r")
  # NOTE: ifelse() cannot return NULL (it errors when the test is FALSE),
  # so a plain if/else replaces the original
  # ifelse(exists("outpath"), outpath, NULL)
  savepath <- if (exists("outpath")) outpath else NULL
  source("load_pckgs.r")
  source("pc_plotter.r")
  dependencies <- c("dplyr", "tidyr", "purrr", "IRanges", "e1071", "MASS", "ggplot2", "patchwork", "ggplotify", "ggpubr", "cowplot")
  invisible(lapply(dependencies, load_pkg))
  # base m/z value(s) of every detected peak whose (rounded)
  # retention-time window covers rt
  lookup_base_mz <- function(rt) {
    all_peaks %>%
      dplyr::filter(floor(start_time) <= ceiling(rt), ceiling(end_time) >= floor(rt)) %>%
      dplyr::pull(base_mz)
  }
  all_peaks <- dplyr::bind_rows(data$peaks)
  # interpolating
  message("\nInterpolating...")
  # Determine the time range shared by all samples
  common_min <- max(sapply(data$tics, function(df) min(df$time)))
  common_max <- min(sapply(data$tics, function(df) max(df$time)))
  n_points <- min(sapply(data$tics, nrow))
  max_len_dif <- max(sapply(data$tics, nrow)) - n_points
  # Define a common time grid over that range
  common_time <- seq(common_min, common_max, length.out = n_points)
  cat("\tCommon time length set to ", n_points, "; the range is from ", common_min, "to", common_max, "\n")
  cat("\tMaximum length difference between samples:", max_len_dif, "\n")
  # Resample every TIC onto the common grid by linear interpolation
  tic_interp <- purrr::map(data$tics, function(df) {
    approx(x = df$time, y = df$tic, xout = common_time)$y
  })
  tic_matrix <- do.call(rbind, tic_interp)
  colnames(tic_matrix) <- as.character(round(common_time, 2))
  cat("\tDimensions of interpoled tic matrix:", dim(tic_matrix), "\n")
  # subset to only high tics
  if (only_high_tic) {
    message("\nSubsetting to high tics...")
    hi_tic_ranges <- all_peaks %>%
      dplyr::select(start_index, end_index) %>%
      { IRanges::IRanges(start = .$start_index, end = .$end_index) } %>%
      IRanges::reduce() %>%
      as.data.frame() %>%
      dplyr::as_tibble() %>%
      dplyr::arrange(start) %>%
      dplyr::mutate(
        # expand each range a little just in case, clamping to valid
        # column indices so the matrix subscript below cannot run out
        # of bounds
        start = pmax(start - expander, 1L),
        end = pmin(end + expander, ncol(tic_matrix)),
        width = end - start
      )
    # NOTE(review): start/end are scan indices of the raw TICs but are
    # used as columns of the interpolated grid here -- assumed close
    # enough given the small length differences; confirm if resolutions
    # ever diverge
    selected_scans <- unlist(lapply(seq_len(nrow(hi_tic_ranges)), function(i) {
      hi_tic_ranges$start[i]:hi_tic_ranges$end[i]
    }))
    cat("\tNumber of time ranges:", nrow(hi_tic_ranges), "; total range:", sum(hi_tic_ranges$width), "\n")
    tic_matrix <- tic_matrix[, selected_scans, drop = FALSE]
    cat("\tDimensions of subsampled tic matrix:", dim(tic_matrix), "\n")
  }
  # save the original colnames (retention-time labels); restored after
  # the matrix()/scale() calls below strip them
  rt_labels <- colnames(tic_matrix)
  # Box-Cox transformation and scaling
  message("\nBox-Coxing and scaling...")
  tic_matrix_boxcox <- apply(tic_matrix, 2, function(x) {
    bc <- boxcox(x ~ 1, plotit = FALSE)
    lambda <- bc$x[which.max(bc$y)]
    if (abs(lambda) < 1e-6) {
      # When lambda is very close to 0 the Box-Cox transform degenerates
      # to the logarithm
      return(log(x))
    } else {
      return((x^lambda - 1) / lambda)
    }
  })
  tic_matrix <- matrix(tic_matrix_boxcox, nrow = nrow(tic_matrix), ncol = ncol(tic_matrix))
  tic_matrix <- scale(tic_matrix)
  colnames(tic_matrix) <- rt_labels # restore the colnames
  # Check the skewness
  skew_values <- apply(tic_matrix, 2, e1071::skewness)
  cat("\tSkewness:\n")
  print(summary(skew_values))
  # Guard against a still-skewed matrix.  The original tested
  # skew_values[4] -- the skewness of the 4th column -- which was
  # evidently meant to be the mean (4th element of summary()).
  if (mean(skew_values, na.rm = TRUE) > 1) stop("Skewness is larger than 1...")
  # PCA
  message("\nPrincipal Component Analysis...")
  # the matrix is already centered/scaled above, hence center/scale. FALSE
  pca_result <- prcomp(tic_matrix, center = FALSE, scale. = FALSE)
  pca_summary <- summary(pca_result)
  cat("\tSummary of the first five principal components:\n")
  print(pca_summary$importance[, 1:5])
  # visualization: join PCA scores with sample metadata
  message("\nCooking the plot...")
  pca_df <- as.data.frame(pca_result$x)
  pca_df$file <- names(data$tics)
  pca_plot <- index_tbl %>%
    dplyr::mutate(file = basename(path)) %>%
    dplyr::select(file, period, site) %>%
    dplyr::inner_join(pca_df, by = "file")
  # PC1 vs PC2 scores
  p1_title <- ifelse(
    only_high_tic,
    "TIC principal component analysis, subseted for high tics",
    "TIC principal component analysis"
  )
  p1 <- ggplot2::ggplot(
    pca_plot,
    ggplot2::aes(x = PC1, y = PC2, color = period, shape = site)
  ) +
    ggplot2::geom_point(size = 3) +
    ggplot2::theme_minimal() +
    ggplot2::labs(
      title = p1_title,
      x = "PC1",
      y = "PC2"
    )
  # loadings of the first two PCs
  t <- pc_plotter(pca_result, pc = 1)
  p2 <- t$plot
  top_loadings <- t$sel_loadings
  t <- pc_plotter(pca_result, pc = 2)
  p3 <- t$plot
  top_loadings <- c(top_loadings, t$sel_loadings)
  # table of ret times and base m/z-s responsible for the observed clustering
  noteworthy <- dplyr::tibble(
    ret_time = sort(top_loadings),
    base_mz = purrr::map_chr(
      sort(top_loadings), ~ {
        matching <- lookup_base_mz(.x)
        # NA_character_ keeps map_chr()'s strict character typing happy
        if (length(matching) == 0) { NA_character_ } else { paste(unique(matching), collapse = ", ") }
      })
  )
  # split the table into two columns for display
  mid <- ceiling(nrow(noteworthy) / 2)
  ntwrth_left <- noteworthy[seq_len(mid), ]
  # guard the right half: with a single row the original
  # (mid + 1):nrow(noteworthy) would count backwards and duplicate it
  ntwrth_right <- if (nrow(noteworthy) > mid) {
    noteworthy[(mid + 1):nrow(noteworthy), ]
  } else {
    noteworthy[0, ]
  }
  # Create table grobs
  p4_left <- ggpubr::ggtexttable(ntwrth_left, rows = NULL, theme = ggpubr::ttheme("minimal")) +
    ggplot2::theme(plot.margin = ggplot2::margin(5, 5, 5, 5))
  p4_right <- ggpubr::ggtexttable(ntwrth_right, rows = NULL, theme = ggpubr::ttheme("minimal")) +
    ggplot2::theme(plot.margin = ggplot2::margin(5, 5, 5, 5))
  p4_table <- cowplot::plot_grid(p4_left, p4_right, ncol = 2, align = "hv")
  p4_title <- cowplot::ggdraw() +
    cowplot::draw_label("Noteworthy peaks", fontface = 'bold', size = 16, x = 0.5, hjust = 0.5)
  p4 <- cowplot::plot_grid(p4_title, p4_table, ncol = 1, rel_heights = c(0.12, 1))
  # Combine the plots
  if (only_high_tic) {
    combined_plot <- p3 + p1 + p4 + p2 + plot_layout(ncol = 2, nrow = 2)
  } else {
    combined_plot <- p1
  }
  results <- list(plot = combined_plot)
  return(results)
}
source("process_tics.r")
# PCA over the full interpolated TIC matrix (all retention times kept)
ticz <- process_tics(data = e, index_tbl = spec_index, only_high_tic = F)
## Common time length set to 17856 ; the range is from 257.341 to 2069.707
## Maximum length difference between samples: 41
## Dimensions of interpoled tic matrix: 105 17856
## Skewness:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.46362 0.04883 0.10339 0.10481 0.15585 0.59073
## Summary of the first five principal components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 110.48748 38.79363 33.33107 22.56524 19.37091
## Proportion of Variance 0.68366 0.08428 0.06222 0.02852 0.02101
## Cumulative Proportion 0.68366 0.76795 0.83016 0.85868 0.87969
# save the PCA overview for the full TIC matrix
plot_saver(ticz$plot, outpath, "tics_all.png", 10, 10, 100)
No prominent clustering: with more variables than
samples, PCA can overfit to noise, leading to misleading principal
components. So our next step is to shrink the TIC data to only the high
intensity peaks of the TICs, which have already been calculated by the
ticsnpeaks function above (the hot areas are
accumulated from all the samples); for that we call the
process_tics function again with the only_high_tic
parameter set to TRUE.
source("process_tics.r")
# rerun the PCA restricted to the accumulated high-TIC peak ranges
ticz <- process_tics(data = e, index_tbl = spec_index, only_high_tic = T)
## Common time length set to 17856 ; the range is from 257.341 to 2069.707
## Maximum length difference between samples: 41
## Dimensions of interpoled tic matrix: 105 17856
## Number of time ranges: 57 ; total range: 3480
## Dimensions of subsampled tic matrix: 105 3537
## Skewness:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.37168 0.06673 0.13890 0.14272 0.21547 0.59073
## Summary of the first five principal components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 48.39170 17.08743 14.22424 10.53953 9.445137
## Proportion of Variance 0.66207 0.08255 0.05720 0.03141 0.025220
## Cumulative Proportion 0.66207 0.74462 0.80183 0.83323 0.858460
# save the PCA overview for the peak-subset TIC matrix
plot_saver(ticz$plot, outpath, "tics_sel.png", 10, 10, 100)
Now we can see some clustering in the space of the first two
principal components, where the period (Shang & Zhou) might also
have an impact. The first five PCs (out of 105) explain most of the
variance, and the first two of them are the most pronounced. Each
principal component is a linear combination of all original variables
(total ion intensities per retention time), where each
variable is multiplied by a specific coefficient known as a
loading. The variables with larger absolute loadings in PC1
and PC2 have a stronger impact on the visual clustering observed in the
plot of PC1 and PC2. We have selected those variables whose loadings
were within 0.95 of the maximum loading in PC1 and PC2 and listed them
as a list of noteworthy peak retention times; the base_mz is also
presented for those peaks. The base_mz is the m/z value of the
most intense peak at a given retention time. It might be
interesting to understand whether the discovered noteworthy peaks have
some meaning for the residue analysis.